├── .gitignore
├── README.md
├── data
└── README.md
├── docs
├── book
│ ├── .gitignore
│ ├── .travis.yml
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ └── src
│ │ ├── chapters
│ │ ├── abstract-en.tex
│ │ ├── abstract-id.tex
│ │ ├── appendix-1.tex
│ │ ├── appendix-2.tex
│ │ ├── appendix-3.tex
│ │ ├── approval.tex
│ │ ├── chapter-1.tex
│ │ ├── chapter-2.tex
│ │ ├── chapter-3.tex
│ │ ├── chapter-4.tex
│ │ ├── chapter-5.tex
│ │ ├── cover.tex
│ │ ├── daftar_istilah.tex
│ │ ├── forewords.tex
│ │ ├── instruction.tex
│ │ └── statement.tex
│ │ ├── config
│ │ ├── hypenation-id.tex
│ │ ├── if-itb-thesis.sty
│ │ └── informations.tex
│ │ ├── references.bib
│ │ ├── resources
│ │ ├── Arsitektur-TA-1.png
│ │ ├── Arsitektur-TA-2.png
│ │ ├── Arsitektur-TA-3.png
│ │ ├── Data-tipe-A.png
│ │ ├── Data-tipe-B.png
│ │ ├── Data-tipe-C.png
│ │ ├── Full-fine-tune.png
│ │ ├── Head-fine-tune.png
│ │ ├── cbow-skip-gram-illustration.png
│ │ ├── cover-ganesha.jpg
│ │ ├── data_xlm_r.png
│ │ ├── ilustrasi-mlm.png
│ │ ├── ilustration-eng-spn-word.png
│ │ ├── linimasa-1.jpg
│ │ ├── linimasa-2.jpg
│ │ ├── luong_et_al_2015.jpg
│ │ ├── overview-attention.png
│ │ ├── overview-transformer.png
│ │ ├── plot-full-prosa-xlmr.png
│ │ ├── plot-full-toxic-xlmr.png
│ │ ├── plot-full-trip-advisor-xlmr-duplicate.png
│ │ ├── plot-full-trip-advisor-xlmr.png
│ │ ├── plot-gain-mbert.png
│ │ ├── plot-gain-xlmr.png
│ │ ├── plot-head-prosa-mbert.png
│ │ ├── plot-head-prosa-xlmr.png
│ │ ├── plot-head-toxic-mbert.png
│ │ ├── plot-head-toxic-xlmr.png
│ │ ├── plot-head-trip-mbert.png
│ │ ├── plot-head-trip-xlmr.png
│ │ ├── prosa-mbert-eng-1.png
│ │ ├── prosa-mbert-eng-2.png
│ │ ├── prosa-mbert-malay-1.png
│ │ ├── prosa-mbert-malay-2.png
│ │ ├── prosa-xlmr-eng-1.png
│ │ ├── prosa-xlmr-eng-2.png
│ │ ├── prosa-xlmr-malay-1.png
│ │ ├── prosa-xlmr-malay-2.png
│ │ ├── tandatangan.png
│ │ └── tandatangan_bu_ayu.png
│ │ ├── thesis-blx.bib
│ │ ├── thesis.fdb_latexmk
│ │ ├── thesis.fls
│ │ ├── thesis.run.xml
│ │ └── thesis.tex
└── paper
│ ├── Figure_explained.png
│ ├── Figure_symbol.png
│ ├── Improving Indonesian Text Classification Using Multilingual Language Model.docx
│ ├── Improving Indonesian Text Classification Using Multilingual Language Model.pdf
│ ├── Improving Indonesian Text Classification Using Multilingual Language Model.zip
│ └── Improving-Indonesian-Text-Classification-Using-Multilingual-Language-Model-Putra.pdf
├── notebooks
├── README.md
├── fine_tune_full
│ ├── prosa
│ │ └── xlm_r
│ │ │ └── indoxtc-fine-tune-full-prosa-xlm-r.ipynb
│ ├── toxic
│ │ ├── xlm_r
│ │ │ └── indoxtc-fine-tune-full-toxic-xlm-r-simpler.ipynb
│ │ └── xlm_r_comparable
│ │ │ └── indoxtc-fine-tune-full-toxic-xlm-r-comparable.ipynb
│ └── trip_advisor
│ │ ├── xlm_r
│ │ └── indoxtc-fine-tune-full-tripadvisor-xlm-r.ipynb
│ │ └── xlm_r_duplicate_removed
│ │ └── indoxtc-fine-tune-full-tripadvisor-xlm-r-dupli.ipynb
├── fine_tune_head
│ ├── extracting_features
│ │ ├── jigsaw_toxic
│ │ │ ├── mbert
│ │ │ │ ├── indoxtc-combining-toxic-en-features-mbert.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-1.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-2.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-3.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-4.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-5.ipynb
│ │ │ │ └── indoxtc-extracting-toxic-en-features-mbert-6.ipynb
│ │ │ └── xlm_r
│ │ │ │ ├── indoxtc-combining-toxic-en-features-xlm-r.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-xlm-r-1.ipynb
│ │ │ │ ├── indoxtc-extracting-toxic-en-features-xlm-r-2.ipynb
│ │ │ │ └── indoxtc-extracting-toxic-en-features-xlm-r-3.ipynb
│ │ ├── prosa
│ │ │ ├── mbert
│ │ │ │ └── indoxtc-extracting-prosa-features-mbert.ipynb
│ │ │ └── xlm_r
│ │ │ │ └── indoxtc-extracting-prosa-features-xlm-r.ipynb
│ │ ├── toxic
│ │ │ ├── mbert
│ │ │ │ └── indoxtc-extracting-toxic-features-mbert.ipynb
│ │ │ └── xlm_r
│ │ │ │ └── indoxtc-extracting-toxic-features-xlm-r.ipynb
│ │ ├── trip_advisor
│ │ │ ├── mbert
│ │ │ │ └── indoxtc-extracting-tripadvisor-features-mbert.ipynb
│ │ │ └── xlm_r
│ │ │ │ └── indoxtc-extracting-tripadvisor-features-xlm-r.ipynb
│ │ └── yelp_review
│ │ │ ├── mbert
│ │ │ ├── indoxtc-combining-yelp-features-mbert.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-1.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-2.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-3.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-4.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-5.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-6.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-7.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-mbert-8.ipynb
│ │ │ └── indoxtc-extracting-yelp-features-mbert-9.ipynb
│ │ │ └── xlm_r
│ │ │ ├── indoxtc-combining-yelp-features-xlm-r.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-1.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-2.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-3.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-4.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-5.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-6.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-7.ipynb
│ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-8.ipynb
│ │ │ └── indoxtc-extracting-yelp-features-xlm-r-9.ipynb
│ ├── prosa
│ │ ├── mbert
│ │ │ └── indoxtc-fine-tune-head-prosa-mbert-all.ipynb
│ │ └── xlm_r
│ │ │ └── indoxtc-fine-tune-head-prosa-xlm-r-all.ipynb
│ ├── toxic
│ │ ├── mbert
│ │ │ └── indoxtc-fine-tune-head-toxic-mbert-all.ipynb
│ │ └── xlm_r
│ │ │ └── indoxtc-fine-tune-head-toxic-xlm-r-all.ipynb
│ └── trip_advisor
│ │ ├── mbert
│ │ └── indoxtc-fine-tune-head-tripadvisor-mbert-all.ipynb
│ │ └── xlm_r
│ │ └── indoxtc-fine-tune-head-tripadvisor-xlm-r-all.ipynb
└── result_analysis
│ ├── fine_tune_full
│ ├── prosa
│ │ └── xlm_r
│ │ │ ├── Analyze Improvement.ipynb
│ │ │ ├── Analyze Zero-shot.ipynb
│ │ │ ├── Result Prosa.ipynb
│ │ │ ├── final_prosa_yelp_xlm_r_result_combined_10981.csv
│ │ │ ├── plot-full-prosa-xlmr.png
│ │ │ ├── plot.png
│ │ │ ├── result_prosa_yelp_XLM_R_A_10981_0.5_full.csv
│ │ │ ├── result_prosa_yelp_XLM_R_B_10981_0.5_full.csv
│ │ │ ├── result_prosa_yelp_XLM_R_C_10981_0.5_full.csv
│ │ │ ├── result_prosa_yelp_XLM_R_C_10981_1.5_full.csv
│ │ │ ├── result_prosa_yelp_XLM_R_C_10981_1_full.csv
│ │ │ ├── result_prosa_yelp_XLM_R_C_10981_2_full.csv
│ │ │ └── result_prosa_yelp_XLM_R_C_10981_3_full.csv
│ ├── toxic
│ │ ├── xlm_r
│ │ │ ├── Analyze Improvement.ipynb
│ │ │ ├── Analyze Zero-shot.ipynb
│ │ │ ├── Result Toxic.ipynb
│ │ │ ├── final_toxic_toxic_xlm_r_result_combined_11852.csv
│ │ │ ├── plot-full-toxic-xlmr.png
│ │ │ ├── result_toxic_toxic_XLM_R_A_11852_0.5_full.csv
│ │ │ ├── result_toxic_toxic_XLM_R_B_11852_0.5_full.csv
│ │ │ ├── result_toxic_toxic_XLM_R_C_11852_0.5_full.csv
│ │ │ ├── result_toxic_toxic_XLM_R_C_11852_1.5_full.csv
│ │ │ ├── result_toxic_toxic_XLM_R_C_11852_1_full.csv
│ │ │ ├── result_toxic_toxic_XLM_R_C_11852_2_full.csv
│ │ │ └── result_toxic_toxic_XLM_R_C_11852_3_full.csv
│ │ └── xlm_r_comparable
│ │ │ ├── Result Toxic Comparable.ipynb
│ │ │ ├── result_Abusive_toxic_toxic_XLM_R_A_11852_0.5_full.csv
│ │ │ └── result_HS_toxic_toxic_XLM_R_A_11852_0.5_full.csv
│ └── trip_advisor
│ │ ├── xlm_r
│ │ ├── Analyze Improvement.ipynb
│ │ ├── Analyze Zero-shot.ipynb
│ │ ├── Result Trip.ipynb
│ │ ├── final_trip_advisor_yelp_xlm_r_result_combined_12389.csv
│ │ ├── plot-full-trip-advisor-xlmr.png
│ │ ├── result_trip_advisor_yelp_XLM_R_A_12389_0.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_B_12389_0.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_0.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_1.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_1_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_2_full.csv
│ │ └── result_trip_advisor_yelp_XLM_R_C_12389_3_full.csv
│ │ └── xlm_r_duplicate
│ │ ├── Result Trip Dupli.ipynb
│ │ ├── final_trip_advisor_yelp_xlm_r_result_combined_9816.csv
│ │ ├── plot-full-trip-advisor-xlmr-duplicate.png
│ │ ├── result_trip_advisor_yelp_XLM_R_A_9816_0.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_B_9816_0.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_0.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_1.5_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_1_full.csv
│ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_2_full.csv
│ │ └── result_trip_advisor_yelp_XLM_R_C_9816_3_full.csv
│ └── fine_tune_head
│ ├── Gain analysis - mBERT.ipynb
│ ├── Gain analysis.ipynb
│ ├── compilation
│ ├── average-f1-score-gains.png
│ ├── plot-prosa-mbert-english.png
│ ├── plot-prosa-xlmr-english.png
│ ├── plot-toxic-mbert-english.png
│ ├── plot-toxic-xlmr-english.png
│ ├── plot-trip-mbert-english.png
│ └── plot-trip-xlmr-english.png
│ ├── plot.png
│ ├── plot_mbert.png
│ ├── prosa
│ ├── mbert
│ │ ├── Plot Result Prosa mBERT.ipynb
│ │ ├── final_prosa_yelp_mBERT_result_combined_1.csv
│ │ ├── final_prosa_yelp_mBERT_result_combined_2.csv
│ │ ├── gains.csv
│ │ ├── plot-prosa-mbert-english.png
│ │ └── plot.png
│ └── xlm_r
│ │ ├── Plot Result Prosa XLMR.ipynb
│ │ ├── final_prosa_yelp_XLM_R_result_combined_1.csv
│ │ ├── final_prosa_yelp_XLM_R_result_combined_2.csv
│ │ ├── gains.csv
│ │ ├── plot-prosa-xlmr-english.png
│ │ └── plot.png
│ ├── toxic
│ ├── mbert
│ │ ├── Plot Result Toxic mBERT.ipynb
│ │ ├── final_toxic_toxic_mBERT_result_combined_1.csv
│ │ ├── final_toxic_toxic_mBERT_result_combined_2.csv
│ │ ├── gains.csv
│ │ ├── plot-toxic-mbert-english.png
│ │ └── plot.png
│ └── xlm_r
│ │ ├── Plot Result Toxic XLMR.ipynb
│ │ ├── final_toxic_toxic_XLM_R_result_combined_1.csv
│ │ ├── final_toxic_toxic_XLM_R_result_combined_2.csv
│ │ ├── gains.csv
│ │ ├── plot-toxic-xlmr-english.png
│ │ └── plot.png
│ └── trip_advisor
│ ├── mbert
│ ├── Plot Result Trip mBERT.ipynb
│ ├── final_trip_advisor_yelp_mBERT_result_combined_1.csv
│ ├── final_trip_advisor_yelp_mBERT_result_combined_2.csv
│ ├── gains.csv
│ ├── plot-trip-mbert-english.png
│ └── plot.png
│ └── xlm_r
│ ├── Plot Result Trip XLMR.ipynb
│ ├── final_trip_advisor_yelp_XLM_R_result_combined_1.csv
│ ├── final_trip_advisor_yelp_XLM_R_result_combined_2.csv
│ ├── gains.csv
│ ├── plot-trip-xlmr-english.png
│ └── plot.png
└── src
├── README.md
├── extract-feature.py
├── load-data.py
├── model-full.py
└── model-head.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | One sentence summary:
2 | > You can use English text data to improve Indonesian text classification performance using a multilingual language model.
3 |
4 | # Indonesian Text Classification Multilingual
5 |
6 | This repository is my final year undergraduate project at Institut Teknologi Bandung, supervised by [Dr. Eng. Ayu Purwarianti, ST.,MT](https://scholar.google.co.id/citations?user=8jUro_cAAAAJ&hl=en). It contains the unpolished source code (`.py` on [/src](./src/) and `.ipynb` on [/notebooks](./notebooks/)), the [book](./docs/book) (Indonesian), and the paper (English).
7 |
8 | * Book title:
9 | **Klasifikasi Teks Berbahasa Indonesia Menggunakan Multilingual Language Model (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)**
10 |
11 | * Paper title ([Arxiv](https://arxiv.org/abs/2009.05713)):
12 | **Improving Indonesian Text Classification Using Multilingual Language Model**
13 |
14 |
15 | ## Project Organization
16 | ------------
17 | ├── README.md <- The top-level README
18 | ├── data <- Contain information regarding the data
19 | ├── docs
20 | | ├── book <- Latex source for the book
21 | | └── paper <- Microsoft word source for the paper
22 | |
23 | ├── notebooks <- The *.ipynb jupyter notebooks
24 | | ├── fine_tune_full <- Notebooks for full finetune experiments
25 | | ├── fine_tune_head <- Notebooks for feature-based experiments
26 | | └── result_analysis <- Notebooks analyzing and producing figures
27 | |
28 | └── src <- The *.py source code
29 |
30 | ------------
31 |
32 |
33 | ## Abstract
34 |
35 | Compared to English, the amount of labeled data for Indonesian text classification tasks is very small. Recently developed multilingual language models have shown its ability to create multilingual representations effectively. This paper investigates the effect of combining English and Indonesian data on building Indonesian text classification (e.g., sentiment analysis and hate speech) using multilingual language models. Using the feature-based approach, we observe its performance on various data sizes and total added English data. The experiment showed that the addition of English data, especially if the amount of Indonesian data is small, improves performance. Using the fine-tuning approach, we further showed its effectiveness in utilizing the English language to build Indonesian text classification models.
36 |
37 | ## Experiments
38 |
39 | The experiments consist of two multilingual language models (mBERT [1] & XLM-R [2]), three training data scenarios, two training approaches, and five datasets. Every experiment was run on a [Kaggle](https://www.kaggle.com/) kernel. You can find the link to every Kaggle kernel & dataset in each directory.
40 |
41 | #### A. Training Data Scenarios
42 | We investigate the model performance in three different scenarios. Each differs by the combination of the language used in its training data: monolingual, zero-shot, and multilingual. In the monolingual scenario, we use the Indonesian language text to train and validate the model. In the zero-shot scenario, we use the English language text to train the model while being validated on Indonesian text. Lastly, we use a combination of Indonesian and English text to train the model while being validated on Indonesian text in the multilingual scenario. Using these scenarios, we observe the improvement of the added English text.
43 |
44 | #### B. Training Approaches
45 | There are two approaches on applying large pre-trained language representation to downstream tasks: feature-based and fine-tuning [1]. On the feature-based approach, we extract fixed features from the pre-trained model. In this experiment, we use the last hidden state, which is 768 for mBERT and 1024 for XLM-R Large, as the feature. This extracted feature is then fed into a single dense layer, the only layer we trained on the feature-based approach, connected with dropout before finally ending on a sigmoid function. In contrast, the fine-tuning approach trains all the language model parameters, 110M for mBERT and 550M for XLM-R Large, including the last dense layer, using a binary cross-entropy loss on the training data.
46 |
47 | Using the feature-based approach, we run many experiments, as the expensive multilingual representations have been precomputed on all the data. In all training data scenarios, we vary the total data used. More specifically, we train the model using [500, 1000, 2500, 5000, 7500, Max] text data. Specific to the multilingual training data scenario, we vary the amount of added English data by [0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10] times the amount of Indonesian text data. We refer to a multilingual experiment with added English data N times the amount of Indonesian text data as multilingual(N).
48 |
49 | In contrast to the feature-based scenarios, fine-tuning the full language model is expensive and resource-intensive. However, as shown in [1], fully fine-tuning the full language model will result in a better text classifier. We fine-tuned the best performing model on the feature-based scenarios. The experiment was reduced to only using the maximum total data and an added English data multiplier up to 3.
50 |
51 | #### C. Datasets
52 | More details on the book and paper. Quick summary:
53 | * Indonesian:
54 | * Sentiment Analysis 1: [(Farhan & Khodra, 2017) [3]](https://www.researchgate.net/publication/320832619_Sentiment-specific_word_embedding_for_Indonesian_sentiment_analysis)
55 | * Sentiment Analysis 2: [(Crisdayanti & Purwarianti, 2019) [4]](https://ieeexplore.ieee.org/abstract/document/8904199/)
56 | * Hate-speech and Abusive: [(Ibrohim & Budi, 2019) [5]](https://www.aclweb.org/anthology/W19-3506.pdf)
57 |
58 | * English:
59 | * Sentiment Analysis: [Yelp Review Sentiment Dataset](https://www.kaggle.com/ilhamfp31/yelp-review-dataset)
60 | * Toxic Comment: [Jigsaw Toxic Comment](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data)
61 |
62 |
63 | #### D. Training Reproducibility and Hyperparameters
64 | We split the training data into training and validation set with a 90:10 ratio. The split was done in a stratified fashion, conserving the distribution of labels between the training & validation set. The result is a dataset separated into training, validation, and test sets.
65 |
66 | Each experiment will train the model using the training set and validate it to the validation set on each epoch. After each epoch, we will evaluate whether we will continue, reduce the learning rate, or stop the training process based on validation set performance and the hyperparameter set on each condition. In the end, we use the model from the best performing epoch based on its validation performance to predict the test set.
67 |
68 | On the feature-based experiment, we set the final layer dropout probability to 0.2, the learning rate reducer patience to 5, and the early stopping patience to 12. On full fine-tune experiment, we set the final layer dropout probability to 0.2, the learning rate reducer patience to 0, and the early stopping patience to 4. Every validation and prediction use 0.5 as its label threshold.
69 |
70 | To ensure reproducibility, we set every random seed possible on each experiment. On the feature-based experiment, we average the result of 6 different runs by varying the seed from 1-6. Running the same experiment on the feature-based approach will result in the same final score. On the full fine-tune experiment, we only run one experiment. While the result should not differ substantially, [the exact reproducibility cannot be guaranteed as the training was done on a TPU](https://suneeta-mall.github.io/2019/12/22/Reproducible-ml-tensorflow.html).
71 |
72 | ## Result
73 | #### A. Feature-based experiment
74 |
75 |
Fig. 1. Feature-based experiment result with XLM-R on [3] (left), [4] (middle), and [5] (right)
76 |
77 |
78 | The result of feature-based experiments with XLM-R model on all datasets can be seen in Fig 1. Through this result, we can see that adding English data can help the performance of the model. On [3] & [4] dataset, adding English data consistently improves the performance. But on [5] dataset, there's a point where the added English data results in worse performance. We hypothesize this is due to the large difference in what constitutes hate-speech (or toxic by Jigsaw dataset) between the datasets used.
79 |
80 |
81 |
82 |
Fig. 2. Feature-based experiment result with mBERT on [3] (left), [4] (middle), and [5] (right)
83 |
84 | The result of feature-based experiments with mBERT model on all datasets can be seen in Fig 2. The same phenomenon is observed on mBERT based experiment, although the performance is substantially lower. This is expected as XLM-R is designed to improve mBERT on various design choices.
85 |
86 | Defining the gain as the difference between monolingual and its highest multilingual performance, Table I shows the gains averaged over all datasets across total data and model. The highest gain can be seen on the lowest amount of total data used, 500, with an F1-score gain of 0.176 using the XLM-R model and 0.129 using the mBERT model. The results suggest that the lower the amount of data used, the more gain is yielded by adding English data to the training set.
87 |
88 |
Table I. Average F1-Score Gains
89 |
90 |
91 |
92 | #### B. Full fine-tune experiment
93 | Fully fine-tuning all parameters, in addition to utilizing English data, proved to be effective in building a better Indonesian text classification model. On the [3] dataset, the highest performance was achieved on the zero-shot scenario, yielding a 0.893 F1-score and improving on the previous work's 0.834. On the [4] dataset, the highest performance was achieved on the multilingual(1.5) scenario, yielding a perfect F1-score and improving on the previous work's 0.9369. On the [5] dataset, the highest performance was achieved on the multilingual(3) scenario, yielding a 0.898 F1-score and 89.9% accuracy. To provide a fair comparison with the previous work by Ibrohim & Budi [5], we also ran the experiment using the original labels and the monolingual scenario. The experiment yielded 89.52% average accuracy, improving on the previous work's 77.36%.
94 |
95 | ## References
96 | Research mentioned in this README:
97 | [1] J. Devlin et al. “BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding”. In: arXiv:1810.04805 [cs] (2019). arXiv: 1810.04805. URL: http://arxiv.org/abs/1810.04805.
98 | [2] A. Conneau et al. “Unsupervised Cross-lingual Representation Learning at Scale”. In: arXiv:1911.02116 [cs] (2020). arXiv: 1911.02116. URL: http://arxiv.org/abs/1911.02116.
99 | [3] A. N. Farhan & M. L. Khodra. “Sentiment-specific word embedding for Indonesian sentiment analysis”. In: 2017 International Conference on Advanced Informatics, Concepts, Theory, and Applications (ICAICTA). 2017, 1–5. DOI: 10.1109/ICAICTA.2017.8090964.
100 | [4] I. A.P. A. Crisdayanti & A. Purwarianti. “Improving Bi-LSTM Performance for Indonesian Sentiment Analysis Using Paragraph Vector”. In: (2019).
101 | [5] M. O. Ibrohim & I. Budi. “Multi-label Hate Speech and Abusive Language Detection in Indonesian Twitter”. In: Proceedings of the Third Workshop on Abusive Language Online. Association for Computational Linguistics, 2019, 46–57. DOI: 10 . 18653 / v1 / W19 - 3506. URL: https://www.aclweb.org/anthology/W19-3506.
102 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | # Dataset Source
2 | Here's the link to the dataset used in this project.
3 |
4 | ## Indonesian
5 | * Sentiment Analysis 1: [(Farhan & Khodra, 2017)](https://www.kaggle.com/ilhamfp31/dataset-tripadvisor)
6 | * Sentiment Analysis 2: [(Crisdayanti & Purwarianti, 2019)](https://www.kaggle.com/ilhamfp31/dataset-prosa/)
7 | * Hate-speech and Abusive (Ibrohim & Budi, 2019):
8 | * Original dataset link: https://github.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection
9 | * Kaggle port of the original dataset: https://www.kaggle.com/ilhamfp31/indonesian-abusive-and-hate-speech-twitter-text
10 | * [Here's the kernel to my preprocessing process](https://www.kaggle.com/ilhamfp31/simpler-preprocess-indonesian-hate-abusive-text)
11 |
12 |
13 |
14 | ## English
15 | * Sentiment Analysis: [Yelp Review Sentiment Dataset](https://www.kaggle.com/ilhamfp31/yelp-review-dataset)
16 | * Toxic Comment: [Jigsaw Toxic Comment](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data)
--------------------------------------------------------------------------------
/docs/book/.gitignore:
--------------------------------------------------------------------------------
1 | *.aux
2 | *.toc
3 | *.tex~
4 | *.log
5 | *.lof
6 | *.idx
7 | *.bbl
8 | *.blg
9 | *.lot
10 | *.ilg
11 | *.lol
12 | *.out
13 | *.ind
14 | *.backup
15 | *.synctex.gz
16 | texmf
17 | thesis.pdf
18 | .directory
19 | output/*
20 | build/*
21 | *.pth
22 | codes_xnli_100.txt
23 | notebook/tools/mosesdecoder/*
24 | __pycache__/*
--------------------------------------------------------------------------------
/docs/book/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | dist: trusty
3 |
4 | before_install:
5 | - sudo apt-get update
6 |
7 | install:
8 | - sudo apt-get install -y texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended
9 | - sudo apt-get install -y texlive-bibtex-extra biber xzdec texlive-lang-other
10 | - sudo apt-get install -y latexmk
11 | - tex --version
12 | - pdflatex --version
13 |
14 | script:
15 | - make install
16 |
--------------------------------------------------------------------------------
/docs/book/Makefile:
--------------------------------------------------------------------------------
1 | all: clean install
2 |
3 | install:
4 | mkdir -p output
5 | mkdir -p build
6 | cd src && latexmk -pdf -bibtex -outdir=../build thesis.tex
7 | mv build/thesis.pdf output/ta.pdf
8 |
9 | clean:
10 | rm -f output/* build/*
11 | find . -iname "*~" -exec rm '{}' ';'
12 |
--------------------------------------------------------------------------------
/docs/book/README.md:
--------------------------------------------------------------------------------
1 | Templat LaTeX Tesis Informatika ITB
2 | ===================================
3 | oleh: Petra Novandi
4 |
5 | Dokumen ini merupakan templat LaTeX yang ditujukan untuk laporan
6 | tesis di program studi Teknik Informatika ITB. Templat ini penulis
7 | gunakan dalam penulisan laporan tesis penulis dan dengan semangat
8 | berbagi penulis memutuskan untuk mempublikasikan templat ini agar
9 | dapat digunakan oleh banyak orang.
10 |
11 | Silakan mengunduh, menggunakan, memodifikasi, dan menyebarkan
12 | templat ini. :)
13 |
14 |
15 | Kebutuhan
16 | ---------
17 |
18 | Program telah diuji dalam sistem operasi Linux Ubuntu 18.04. Untuk melakukan instalasi
19 | perangkat lunak yang dibutuhkan, eksekusi perintah berikut.
20 |
21 | ```
22 | sudo apt-get -qq update && sudo apt-get install -y --no-install-recommends \
23 | texlive-fonts-recommended texlive-latex-extra texlive-fonts-extra \
24 | dvipng texlive-latex-recommended \
25 | texlive-bibtex-extra biber xzdec
26 | ```
27 |
28 | For latexmk package
29 | ```
30 | sudo apt install latexmk
31 | ```
32 |
33 | For bahasa package
34 | ```
35 | sudo apt-get install texlive-lang-other
36 | ```
37 |
38 | For russian package
39 | ```
40 | sudo apt-get install texlive-lang-cyrillic
41 | ```
42 |
43 | For japanese package
44 | ```
45 | sudo apt-get install latex-cjk-all
46 | ```
47 |
48 | Penggunaan
49 | ----------
50 |
51 | Templat ini telah dilengkapi oleh skrip untuk melakukan kompilasi
52 | Makefile. Untuk melakukan kompilasi cukup eksekusi perintah berikut
53 |
54 | ```
55 | make
56 | ```
57 |
58 | Hasil kompilasi akan berada pada berkas `output/ta.pdf`.
59 |
60 | Kontribusi
61 | ----------
62 |
63 | Templat ini dapat digunakan secara gratis, akan tetapi penulis sangat
64 | berharap adanya kritik serta saran dari pengguna untuk meningkatkan
65 | kualitas hasil dan penggunaan templat ini.
66 |
67 | Kritik dan saran tersebut dapat dikirim melalui URL
68 | .
69 |
70 | Terima Kasih
71 | -----------
72 |
73 | * Steven Lolong atas pemberian templat LaTeX yang asli.
74 | * Peb Ruswono Aryan atas bantuan pelengkapan struktur dokumen.
75 |
--------------------------------------------------------------------------------
/docs/book/make.bat:
--------------------------------------------------------------------------------
1 | REM clean: remove previous build artifacts (ignore errors if the dirs do not exist)
2 | rmdir /S /Q build
3 | rmdir /S /Q output
4 | 
5 | REM install: Windows mkdir has no -p flag ("-p" would be created as a literal directory)
6 | mkdir output
7 | mkdir build
8 | latexmk -pdf -bibtex -outdir=../build -cd src/thesis.tex
9 | move build\thesis.pdf output
10 |
--------------------------------------------------------------------------------
/docs/book/src/chapters/abstract-en.tex:
--------------------------------------------------------------------------------
1 | \clearpage
2 | \chapter*{Abstract}
3 | \addcontentsline{toc}{chapter}{Abstract}
4 |
5 | %put your abstract here
6 | \blindtext
7 |
8 | \clearpage
--------------------------------------------------------------------------------
/docs/book/src/chapters/abstract-id.tex:
--------------------------------------------------------------------------------
1 | \clearpage
2 | \chapter*{ABSTRAK}
3 | \addcontentsline{toc}{chapter}{Abstrak}
4 | \begin{center}
5 | \large \bfseries \MakeUppercase{Klasifikasi Teks Berbahasa Indonesia Menggunakan \textit{Multilingual Language Model (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)}}
6 |
7 | \normalsize \normalfont{Oleh\\
8 | ILHAM FIRDAUSI PUTRA\\
9 | NIM : 13516140
10 | }
11 | \end{center}
12 |
13 | %taruh abstrak bahasa indonesia di sini
14 |
15 | Klasifikasi teks adalah proses memprediksi kategori tertentu dari sebuah teks. Contoh kategori adalah nilai sentimen atau status ujaran kebencian. Teknik klasifikasi teks \textit{state-of-the-art} saat ini menggunakan deep learning yang memerlukan data latih dalam ukuran besar. Bersamaan dengan itu, perkembangan dalam bidang representasi teks telah memungkinkan teks dari berbagai bahasa direpresentasikan dalam satu bidang yang sama menggunakan \textit{multilingual language model}. Dua diantaranya adalah MultilingualBERT \parencite{Devlin_Chang_Lee_Toutanova_2019} dan XLM-R\parencite{Conneau_XLMR}.
16 |
17 | Dengan memanfaatkan MultilingualBERT dan XLM-R, representasi teks antar bahasa dapat digunakan untuk membangun model klasifikasi bahasa Indonesia dengan kombinasi data bahasa Indonesia dan bahasa Inggris. Tugas akhir ini memanfaatkan hal tersebut untuk membangun model klasifikasi teks bahasa Indonesia yang meningkatkan performa hasil penelitian \parencite{FarhanKhodra2017} \& \parencite{CrisdayantiPurwarianti2019} mengenai analisis sentimen dan versi biner penelitian \parencite{Ibrohim_Budi_2019} mengenai ujaran kebencian \& kasar. Eksperimen dilakukan dengan memvariasikan jumlah data bahasa Indonesia, jumlah data bahasa Inggris, dan teknik \textit{fine-tuning}.
18 |
19 | Hasil eksperimen menunjukkan XLM-R berhasil meningkatkan hasil analisis sentimen pada dataset penelitian \parencite{FarhanKhodra2017} dari F1-score 0,8341 ke 0,893; penelitian \parencite{CrisdayantiPurwarianti2019} dari F1-score 0,9369 ke 1; dan penelitian \parencite{Ibrohim_Budi_2019} dari rata-rata akurasi 77.36\% ke 89.52\%. Meski ada kasus dimana penambahan data bahasa Inggris berlebih menurunkan performa klasifikasi yang harus dianalisa lebih lanjut, hasil eksperimen menunjukkan bahwa penambahan dataset bahasa Inggris, terutama jika data bahasa Indonesia sedikit, dapat membantu meningkatkan performa klasifikasi teks bahasa Indonesia menggunakan model XLM-R.
20 |
21 | \textbf{Kata kunci:} \textit{multilingual language model}, analisis sentimen, klasifikasi ujaran kebencian
22 | \clearpage
--------------------------------------------------------------------------------
/docs/book/src/chapters/appendix-1.tex:
--------------------------------------------------------------------------------
1 | \chapter{Algoritma \textit{Byte Pair Encoding} Sederhana}
2 | \label{appendix:simple_bpe_algorithm}
3 |
4 | Algoritma (contoh nama file: \(bpe.py\)):
5 | \begin{lstlisting}[language=Python]
6 | import re, collections
7 | def get_stats(vocab):
8 | pairs = collections.defaultdict(int)
9 | for word, freq in vocab.items():
10 | symbols = word.split()
11 | for i in range(len(symbols)-1):
12 | pairs[symbols[i],symbols[i+1]] += freq
13 | return pairs
14 |
15 | def merge_vocab(pair, v_in):
16 | v_out = {}
17 | bigram = re.escape(' '.join(pair))
18 | p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
19 | for word in v_in:
20 | w_out = p.sub(''.join(pair), word)
21 | v_out[w_out] = v_in[word]
22 | return v_out
23 | 
24 | vocab = {'l o w </w>' : 5, 'l o w e r </w>' : 2,
25 | 'n e w e s t </w>':6, 'w i d e s t </w>':3}
26 | vocab_test = {'l o w e s t </w>': 1}
27 |
28 | num_merges = 10
29 | for i in range(num_merges):
30 | pairs = get_stats(vocab)
31 | best = max(pairs, key=pairs.get)
32 | print('~~~')
33 | vocab = merge_vocab(best, vocab)
34 | vocab_test = merge_vocab(best, vocab_test)
35 | print("best: ", best)
36 | print("vocab: ", vocab)
37 | print("vocab_test: ", vocab_test)
38 | \end{lstlisting}
39 |
40 | Setelah dijalankan di mesin bersistem operasi Ubuntu 18.04 dengan perintah
41 | \begin{lstlisting}[language=bash]
42 | $ python3 bpe.py
43 | \end{lstlisting}
44 |
45 | akan didapatkan keluaran sebagai berikut:
46 | \begin{lstlisting}[language=bash]
47 | ~~~
48 | best: ('e', 's')
49 | vocab: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
50 | vocab_test: {'l o w es t </w>': 1}
51 | ~~~
52 | best: ('es', 't')
53 | vocab: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
54 | vocab_test: {'l o w est </w>': 1}
55 | ~~~
56 | best: ('est', '</w>')
57 | vocab: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
58 | vocab_test: {'l o w est</w>': 1}
59 | ~~~
60 | best: ('l', 'o')
61 | vocab: {'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
62 | vocab_test: {'lo w est</w>': 1}
63 | ~~~
64 | best: ('lo', 'w')
65 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
66 | vocab_test: {'low est</w>': 1}
67 | ~~~
68 | best: ('n', 'e')
69 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3}
70 | vocab_test: {'low est</w>': 1}
71 | ~~~
72 | best: ('ne', 'w')
73 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3}
74 | vocab_test: {'low est</w>': 1}
75 | ~~~
76 | best: ('new', 'est</w>')
77 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
78 | vocab_test: {'low est</w>': 1}
79 | ~~~
80 | best: ('low', '</w>')
81 | vocab: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
82 | vocab_test: {'low est</w>': 1}
83 | ~~~
84 | best: ('w', 'i')
85 | vocab: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3}
86 | vocab_test: {'low est</w>': 1}
87 | \end{lstlisting}
--------------------------------------------------------------------------------
/docs/book/src/chapters/appendix-2.tex:
--------------------------------------------------------------------------------
1 | \chapter{Rincian Kasus Uji}
--------------------------------------------------------------------------------
/docs/book/src/chapters/appendix-3.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/chapters/appendix-3.tex
--------------------------------------------------------------------------------
/docs/book/src/chapters/approval.tex:
--------------------------------------------------------------------------------
1 | \clearpage
2 | \pagestyle{empty}
3 | \newgeometry{top=3.5cm,bottom=2.5cm,left=3cm,right=2cm}
4 | \begin{center}
5 | \smallskip
6 |
7 | \Large \bfseries \MakeUppercase{\thetitle}
8 | \vfill
9 |
10 | \Large Laporan Tugas Akhir
11 | \vfill
12 |
13 | \large Oleh
14 |
15 | \Large \theauthor
16 |
17 | \large Program Studi Teknik Informatika \\
18 | \normalsize \normalfont Sekolah Teknik Elektro dan Informatika \\
19 | Institut Teknologi Bandung \\
20 |
21 | \vfill
22 | \normalsize \normalfont{
23 |
24 | }
25 |
26 | Telah disetujui dan disahkan sebagai Laporan Tugas Akhir \\
27 | di Bandung, pada tanggal 22 Juni 2020.
28 |
29 | \vfill
30 | \normalsize \normalfont
31 | Pembimbing\\
32 | % \begin{figure}[!h]
33 | % \centering
34 | % \includegraphics[width=0.2\textwidth]{resources/tandatangan_bu_ayu.png}
35 | % \end{figure}
36 | \vfill
37 | \underline{Dr. Eng. Ayu Purwarianti, ST.,MT.} \\
38 | NIP 19770127 200801 2 011
39 |
40 | % \begin{tabular}{c@{\hskip 0.5in}c}
41 | % Pembimbing I, & Pembimbing II \\
42 | % & \\
43 | % & \\
44 | % & \\
45 | % & \\
46 | % Dr. Eng. Ayu Purwarianti, ST.,MT. & Nama dan Gelar Pembimbing II \\
47 | % NIP 19770127 200801 2 011 & NIP 123456789 \\
48 | % \end{tabular}
49 |
50 | \end{center}
51 | \restoregeometry
52 | \clearpage
53 |
--------------------------------------------------------------------------------
/docs/book/src/chapters/chapter-5.tex:
--------------------------------------------------------------------------------
1 | \chapter{Kesimpulan dan Saran}
2 |
3 | \section{Kesimpulan}
4 | Berikut beberapa kesimpulan yang dapat ditarik dari tugas akhir ini:
5 | \begin{enumerate}
6 | \item Penambahan dataset bahasa Inggris, terutama jika data bahasa Indonesia sedikit, dapat membantu meningkatkan performa klasifikasi teks bahasa Indonesia menggunakan \textit{multilingual language model} XLM-R. Dapat dilihat pada Tabel \ref{tab:gain_conclusion}, jumlah peningkatan performa naik dengan semakin sedikit total data. Hanya saja ada kasus di mana penambahan data bahasa Inggris berlebih menurunkan performa klasifikasi. Analisis lebih lanjut mengenai mengapa hal ini terjadi dan apa solusinya diperlukan.
7 | % Please add the following required packages to your document preamble:
8 | % \usepackage{multirow}
9 | \begin{table}[]
10 | \centering
11 | \caption{Rangkuman peningkatan performa}
12 | \begin{tabular}{|l|r|r|}
13 | \hline
14 | \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}Total\\ Data\end{tabular}}}} & \multicolumn{2}{c|}{\textbf{Rata-rata peningkatan}} \\ \cline{2-3}
15 | \multicolumn{1}{|c|}{} & \multicolumn{1}{c|}{\textbf{XLM-R}} & \multicolumn{1}{c|}{\textbf{mBERT}} \\ \hline
16 | 500 & 0.176221 & 0.129394 \\ \hline
17 | 1000 & 0.165718 & 0.109215 \\ \hline
18 | 2500 & 0.118456 & 0.051226 \\ \hline
19 | 5000 & 0.095780 & 0.029564 \\ \hline
20 | 7500 & 0.086930 & 0.028043 \\ \hline
21 | 10000 & 0.077875 & 0.020184 \\ \hline
22 | \end{tabular}
23 | \label{tab:gain_conclusion}
24 | \end{table}
25 | \item Penambahan dataset bahasa Inggris dapat membantu model mendapatkan informasi baru dalam bahasa Indonesia. Hal ini dapat dilihat dari analisis hasil peningkatan performa pada analisis sentimen dataset B. Model yang sebelumnya gagal memprediksi teks bahasa Indonesia, berhasil setelah ditambahkan data bahasa Inggris.
26 | \item Penggunaan \textit{multilingual language model} yang di \textit{fine-tune} sepenuhnya sangat efektif dalam klasifikasi teks bahasa Indonesia.
27 | \begin{enumerate}
28 | \item Pada eksperimen analisis sentimen dataset A, model mendapatkan F1-score 0,893. Sebuah peningkatan dari penelitian sebelumnya yang mendapatkan F1-score 0,8521.
29 | \item Pada eksperimen analisis sentimen dataset B, model mendapatkan F1-score sempurna. Sebuah peningkatan absolut dari penelitian sebelumnya yang mendapatkan F1-score 0,9369.
30 | \item Pada eksperimen klasifikasi ujaran kebencian, model mendapatkan F1-score 0.898 dan akurasi 89.9\%. Penelitian sebelumnya yang menggunakan 3 label, bukan yang disimplifikasi menjadi 2 seperti di penelitian ini, mendapatkan rata-rata akurasi tertinggi 77.36\%. Agar dapat dibandingkan, eksperimen dijalankan dengan konfigurasi label yang sama dan didapatkan rata-rata akurasi 89.52\% yang merupakan peningkatan dari penelitian sebelumnya.
31 | \end{enumerate}
32 |
33 | \end{enumerate}
34 |
35 | \section{Saran}
36 | Berikut beberapa saran yang dapat digunakan untuk memperbaiki, memperbaharui, atau mengembangkan hasil tugas akhir ini:
37 | \begin{enumerate}
38 | \item Jauhnya perbedaan, baik dari domain, bahasa, atau faktor lainnya yang mempengaruhi pengumpulan data, antara dataset dapat menyebabkan memburuknya performa \textit{multilingual learning}. Untuk penelitian selanjutnya, dapat dicoba beberapa cara untuk mengatasi hal ini. Beberapa di antaranya adalah seperti penelitian \parencite{Lai_Oguz_Yang_Stoyanov_2019} yang menggunakan \textit{universal data augmentation} untuk mengurangi perbedaan tadi pada saat pembelajaran dilakukan.
39 | \item Turunnya performa pada \textit{multilingual learning} permasalahan klasifikasi ujaran kebencian masih harus diteliti lebih lanjut. Perlu dianalisis lebih dalam lagi apakah hal ini dikarenakan teknik \textit{fine-tuning}-nya atau perbedaan domain yang melekat dalam dataset. Penelitian \parencite{Peters_Ruder_Smith_2019} meneliti perbedaan antara dua teknik \textit{fine-tuning} yang dicoba pada tugas akhir ini dan mencoba menganalisis kemiripan datasetnya. Hal tersebut dapat dijadikan pedoman dalam menganalisis lebih lanjut fenomena yang diobservasi pada tugas akhir ini.
40 | \item Tugas akhir ini sudah membuktikan efektifnya penggunaan \textit{language model} dalam berbagai permasalahan klasifikasi teks bahasa Indonesia. Penelitian \parencite{Conneau_XLMR} telah mengobservasi turunnya performa model secara general dengan ditambahnya bahasa dalam pelatihan \textit{multilingual language model}. Sampai saat tugas akhir ini ditulis, belum terdapat \textit{language model} spesifik yang dilatih secara masif dalam bahasa Indonesia. Hal ini dapat dicoba dan dibandingkan performanya dengan \textit{multilingual language model} yang dilatih dalam berbagai bahasa.
41 | \end{enumerate}
--------------------------------------------------------------------------------
/docs/book/src/chapters/cover.tex:
--------------------------------------------------------------------------------
1 | \clearpage
2 | \pagestyle{empty}
3 | \newgeometry{top=3.5cm,bottom=2.5cm,left=3cm,right=2cm}
4 | \begin{center}
5 | \smallskip
6 |
7 | \Large \bfseries \MakeUppercase{\thetitle}
8 | \vfill
9 |
10 | \Large Laporan Tugas Akhir
11 | \vfill
12 |
13 | \large Disusun sebagai syarat kelulusan tingkat sarjana
14 | \vfill
15 |
16 | \large Oleh
17 |
18 | \Large \theauthor
19 |
20 | \vfill
21 | \begin{figure}[h]
22 | \centering
23 | \includegraphics[width=0.2\textwidth]{resources/cover-ganesha.jpg}
24 | \end{figure}
25 | \vfill
26 |
27 | \large
28 | \uppercase{
29 | Program Studi Teknik Informatika \\
30 | Sekolah Teknik Elektro dan Informatika \\
31 | Institut Teknologi Bandung
32 | }
33 |
34 | Juni 2020
35 |
36 | \end{center}
37 | \restoregeometry
38 | \clearpage
39 |
--------------------------------------------------------------------------------
/docs/book/src/chapters/daftar_istilah.tex:
--------------------------------------------------------------------------------
1 | % \chapter*{Daftar Istilah}
2 | \clearpage
3 | \begin{center}
4 | \smallskip
5 | \large \bfseries{Daftar Istilah}
6 |
7 | \begin{table}[h]
8 | \begin{tabularx}{\textwidth}{|l|X|}
9 | \textbf{Dataset} & Kumpulan data yang digunakan untuk melakukan pelatihan, validasi, maupun evaluasi \\
10 | \textbf{Model} & Representasi matematika yang didapat dari hasil pembelajaran menggunakan data latih \\
11 | \textbf{Baseline} & Performa model atau model yang dijadikan acuan dasar \\
12 | \textbf{Arsitektur} & Struktur model yang terdiri dari berbagai macam rule, fungsionalitas, dan implementasi \\
13 | \textbf{Transformer} & Arsitektur yang pertama kali dideskripsikan oleh Vaswani et al. (2017) untuk memodelkan sekuens. \\
14 | \textbf{LearningRate} & Besar perubahan yang dilakukan ke model pada setiap iterasi pembelajaran \\
15 | \textbf{Callback} & Fungsi yang melekat pada fase pembelajaran model \\
16 | \textbf{EarlyStopping} & Callback yang akan memberhentikan pembelajaran ketika kondisi yang ditentukan telah dipenuhi \\
17 | \textbf{ReduceLROnPlateau} & Callback yang akan menurunkan besar LearningRate ketika model sudah tidak belajar lagi berdasarkan kondisi yang ditentukan. \\
18 | \textbf{Fine-tune} & Proses melatih kembali model ke permasalahan spesifik dari model yang sebelumnya sudah dilatih pada data umum \\
19 | \end{tabularx}
20 | \end{table}
21 | \end{center}
22 | \clearpage
23 |
--------------------------------------------------------------------------------
/docs/book/src/chapters/forewords.tex:
--------------------------------------------------------------------------------
1 | \chapter*{Kata Pengantar}
2 | \addcontentsline{toc}{chapter}{Kata Pengantar}
3 |
4 | Puji syukur penulis panjatkan ke hadirat Tuhan Yang Maha Kuasa karena atas berkat dan karunia-Nya, penulis dapat menyelesaikan tugas akhir yang berjudul “Klasifikasi Teks Berbahasa Indonesia Menggunakan \textit{Multilingual Language Model} (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)” untuk memenuhi syarat kelulusan tingkat sarjana. Penulis juga ingin mengucapkan terima kasih kepada pihak-pihak yang telah membantu dan mendukung penulis selama pengerjaan tugas akhir ini:
5 |
6 | \begin{enumerate}
7 | \item Ibu Dr. Eng. Ayu Purwarianti, ST.,MT., selaku dosen pembimbing yang telah memberikan arahan, nasehat, dan dukungan selama pengerjaan tugas akhir.
8 | \item Ibu Fariska Zakhralativa Ruskanda S.T., M.T., dan Ibu Dr. Masayu Leylia Khodra, ST., MT. selaku dosen penguji yang telah memberikan evaluasi dan saran kepada penulis.
9 | \item Ibu Dessi Puji Lestari S.T.,M.Eng.,Ph.D, Ibu Dr. Fazat Nur Azizah S.T., M.Sc., dan Bapak Nugraha Priya Utama, Ph.D. selaku dosen mata kuliah IF4091 Tugas Akhir I K01 dan IF4092 Tugas Akhir II yang telah memberi arahan selama pelaksanaan tugas akhir ini.
10 | \item Ibu Dra. Harlili M.Sc. selaku dosen wali yang telah memberikan arahan, nasehat, dan dukungan selama empat tahun berkuliah di program studi Teknik Informatika ITB.
11 | \item Keluarga penulis yang selalu mendukung dan memotivasi penulis untuk tetap semangat dalam kuliah hingga menyelesaikan tugas akhir.
12 | \item Seluruh staf pengajar yang belum disebutkan dari program studi Teknik Informatika yang telah membekali penulis dengan ilmu dan wawasan untuk mendukung pengerjaan tugas akhir.
13 | \item Staf Tata Usaha program studi Teknik Informatika yang telah membantu selama perkuliahan khususnya dalam proses administrasi tugas akhir.
14 | \item Teman-teman penulis yang telah mendukung serta menemani perjalanan kuliah dan pengerjaan tugas akhir ini.
15 |
16 |
17 | \end{enumerate}
18 | Akhir kata, terima kasih banyak kepada semua pihak yang telah secara langsung maupun tidak langsung membantu penyelesaian tugas akhir ini. Penulis berharap tugas akhir ini dapat bermanfaat bagi para pembaca. Penulis juga menyadari bahwa tugas akhir ini tidaklah sempurna. Oleh karena itu, penulis sangat terbuka terhadap kritik dan saran yang membangun terkait tugas akhir ini.
19 |
20 | \begin{flushright}
21 | Bandung, 22 Juni 2020 \\
22 | % \begin{figure}[!h]
23 | % \raggedleft
24 | % \includegraphics[width=0.2\textwidth]{resources/tandatangan.png}
25 | % \end{figure}
26 | Penulis
27 | \end{flushright}
28 | \clearpage
29 |
--------------------------------------------------------------------------------
/docs/book/src/chapters/instruction.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/chapters/instruction.tex
--------------------------------------------------------------------------------
/docs/book/src/chapters/statement.tex:
--------------------------------------------------------------------------------
1 | \clearpage
2 |
3 | \chapter*{Lembar Pernyataan}
4 |
5 | Dengan ini saya menyatakan bahwa:
6 |
7 | \begin{enumerate}
8 |
9 | \item Pengerjaan dan penulisan Laporan Tugas Akhir ini dilakukan tanpa menggunakan bantuan yang tidak dibenarkan.
10 | \item Segala bentuk kutipan dan acuan terhadap tulisan orang lain yang digunakan di dalam penyusunan laporan tugas akhir ini telah dituliskan dengan baik dan benar.
11 | \item Laporan Tugas Akhir ini belum pernah diajukan pada program pendidikan di perguruan tinggi mana pun.
12 |
13 | \end{enumerate}
14 |
15 | Jika terbukti melanggar hal-hal di atas, saya bersedia dikenakan sanksi sesuai dengan Peraturan Akademik dan Kemahasiswaan Institut Teknologi Bandung bagian Penegakan Norma Akademik dan Kemahasiswaan khususnya Pasal 2.1 dan Pasal 2.2.
16 | \vspace{15mm}
17 |
18 |
19 |
20 | \begin{flushright}
21 | Bandung, 22 Juni 2020 \\
22 | \vskip 0.5in
23 | % \begin{figure}[!h]
24 | % \raggedleft
25 | % \includegraphics[width=0.2\textwidth]{resources/tandatangan.png}
26 | % \end{figure}
27 | Ilham Firdausi Putra \\
28 | NIM 13516140
29 | \end{flushright}
30 |
31 | \clearpage
--------------------------------------------------------------------------------
/docs/book/src/config/hypenation-id.tex:
--------------------------------------------------------------------------------
1 | %--------------------------------------------------------------------%
2 | %
3 | % Hyphenation untuk Bahasa Indonesia
4 | %
5 | % @author Petra Barus
6 | %
7 | %--------------------------------------------------------------------%
8 | %
9 | % Secara otomatis LaTeX dapat langsung memenggal kata dalam dokumen,
10 | % tapi sering kali terdapat kesalahan dalam pemenggalan kata. Untuk
11 | % memperbaiki kesalahan pemenggalan kata tertentu, cara pemenggalan
12 | % kata tersebut dapat ditambahkan pada dokumen ini. Pemenggalan
13 | % dilakukan dengan menambahkan karakter '-' pada suku kata yang
14 | % perlu dipisahkan.
15 | %
16 | % Contoh pemenggalan kata 'analisa' dilakukan dengan 'a-na-li-sa'
17 | %
18 | %--------------------------------------------------------------------%
19 |
20 | \hyphenation{
21 | % A
22 | %
23 | a-na-li-sa
24 | a-pli-ka-si
25 |
26 | % B
27 | %
28 | be-be-ra-pa
29 | ber-ge-rak
30 |
31 | % C
32 | %
33 | ca-ri
34 |
35 | % D
36 | %
37 | da-e-rah
38 | di-nya-ta-kan
39 | de-fi-ni-si
40 |
41 | % E
42 | %
43 | e-ner-gi
44 | eks-klu-sif
45 |
46 | % F
47 | %
48 | fa-si-li-tas
49 |
50 | % G
51 | %
52 | ga-bung-an
53 |
54 | % H
55 | %
56 | ha-lang-an
57 |
58 | % I
59 | %
60 | in-duk
61 |
62 | % J
63 | %
64 |
65 | % K
66 | %
67 | ka-me-ra
68 | kua-li-tas
69 |
70 | % L
71 | %
72 |
73 | % M
74 | %
75 |
76 | % N
77 | %
78 |
79 | % O
80 | %
81 |
82 | % P
83 | %
84 |
85 | % Q
86 | %
87 |
88 | % R
89 | %
90 |
91 | % S
92 | %
93 |
94 | % T
95 | %
96 |
97 | % U
98 | %
99 |
100 | % V
101 | %
102 |
103 | % W
104 | %
105 |
106 | % X
107 | %
108 |
109 | % Y
110 | %
111 |
112 | % Z
113 | %
114 | }
115 |
--------------------------------------------------------------------------------
/docs/book/src/config/if-itb-thesis.sty:
--------------------------------------------------------------------------------
1 | %-------------------------------------------------------------------%
2 | %
3 | % Konfigurasi dokumen LaTeX untuk laporan tesis IF ITB
4 | %
5 | % @author Ilham Firdausi Putra
6 | %
7 | %-------------------------------------------------------------------%
8 | %
9 | % Berkas ini merupakan pembaharuan dari berkas awal milik Petra Novandi dan Steven Lolong
10 | %
11 | %-------------------------------------------------------------------%
12 |
13 | % Ukuran kertas
14 | \special{papersize=210mm,297mm}
15 |
16 | % Setting margin
17 | \usepackage[top=3cm,bottom=2.5cm,left=4cm,right=2.5cm]{geometry}
18 |
19 | \usepackage{mathptmx}
20 |
21 | % Format citation
22 | \usepackage[backend=bibtex,citestyle=authoryear,sorting=nyt,firstinits=true]{biblatex}
23 | \renewcommand*{\nameyeardelim}{\addcomma\space}
24 | \renewcommand*\finalnamedelim{\addspace\&\space}
25 |
26 | % Anti hyphenation
27 | \tolerance=1
28 | \emergencystretch=\maxdimen
29 | \hyphenpenalty=10000
30 | \hbadness=10000
31 |
32 | % Judul bahasa Indonesia
33 | \usepackage[russian, bahasa]{babel}
34 | \usepackage[utf8]{inputenc}
35 | \usepackage{csquotes}
36 | \setquotestyle{english}
37 | \usepackage{graphicx}
38 | \usepackage{titling}
39 | \usepackage{blindtext}
40 | \usepackage{sectsty}
41 | \usepackage{chngcntr}
42 | \usepackage{etoolbox}
43 | \usepackage{hyperref} % Package untuk link di daftar isi.
44 | \usepackage{titlesec} % Package Format judul
45 | \usepackage{parskip}
46 |
47 | % Daftar Istilah
48 | \usepackage{tabularx}
49 |
50 | % confusion matrix
51 | \usepackage{multirow}
52 |
53 | % package di equation
54 | \usepackage{amsmath}
55 | \usepackage{amsfonts}
56 |
57 | % Line satu setengah spasi
58 | \renewcommand{\baselinestretch}{1.5}
59 |
60 | % Setting judul chapter
61 | \chapterfont{\centering \large}
62 | \titleformat{\chapter}[display]
63 | {\large\centering\bfseries}
64 | {\chaptertitlename\ \thechapter}{0pt}
65 | {\large\bfseries\uppercase}
66 |
67 | \titlespacing*{\chapter}{0pt}{-30pt}{40pt}
68 | \titlespacing*{\section}{0pt}{10pt}{0pt}
69 | \titlespacing*{\subsection}{0pt}{10pt}{0pt}
70 |
71 | % Setting besar font section
72 | % \newcommand{\secfnt}{\fontsize{8}{12}}
73 | % \newcommand{\ssecfnt}{\fontsize{8}{12}}
74 |
75 | % \titleformat{\section}
76 | % {\normalfont\secfnt\bfseries}{\thesection}{1em}{}
77 |
78 | % \titleformat{\subsection}
79 | % {\normalfont\ssecfnt\bfseries}{\thesubsection}{1em}{}
80 | % \titleformat*{\section}{\normalsize\bfseries}
81 | % \titleformat*{\subsection}{\normalsize\bfseries}
82 | % \sectionfont{\fontsize{8}{12}\selectfont}
83 |
84 | % Untuk nampilin kode
85 | \usepackage[utf8]{inputenc}
86 |
87 | \usepackage{listings}
88 | \usepackage{xcolor}
89 |
90 | \definecolor{codegreen}{rgb}{0,0.6,0}
91 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
92 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
93 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
94 |
95 | \lstdefinestyle{mystyle}{
96 | backgroundcolor=\color{backcolour},
97 | commentstyle=\color{codegreen},
98 | keywordstyle=\color{magenta},
99 | numberstyle=\tiny\color{codegray},
100 | stringstyle=\color{codepurple},
101 | basicstyle=\ttfamily\footnotesize,
102 | breakatwhitespace=false,
103 | breaklines=true,
104 | captionpos=b,
105 | keepspaces=true,
106 | numbers=left,
107 | numbersep=5pt,
108 | showspaces=false,
109 | showstringspaces=false,
110 | showtabs=false,
111 | tabsize=2
112 | }
113 |
114 | \lstset{style=mystyle}
115 |
116 | % Setting nomor pada subbsubsubbab
117 | \setcounter{secnumdepth}{3}
118 |
119 | \makeatletter
120 |
121 | \makeatother
122 |
123 | % Counter untuk figure dan table.
124 | \counterwithin{figure}{section}
125 | \counterwithin{table}{section}
126 |
127 | % bahasa asing
128 | \usepackage{CJKutf8}
129 |
130 | % Spacing pada daftar figure dan table
131 | \makeatletter
132 | \renewcommand*\l@figure{\@dottedtocline{1}{1em}{3.2em}}
133 | \makeatother
134 |
135 | \makeatletter
136 | \renewcommand*\l@table{\@dottedtocline{1}{1em}{3.2em}}
137 | \makeatother
138 |
139 |
140 | % Ganti judul bibliography
141 | % \renewcommand\bibname{Daftar Pustaka}
142 |
--------------------------------------------------------------------------------
/docs/book/src/config/informations.tex:
--------------------------------------------------------------------------------
1 | %-------------------------------------------------------------------%
2 | %
3 | % Berkas informasi umum tesis
4 | %
5 | % @author Ilham Firdausi Putra
6 | %
7 | %-------------------------------------------------------------------%
8 | %
9 | %-------------------------------------------------------------------%
10 |
--------------------------------------------------------------------------------
/docs/book/src/resources/Arsitektur-TA-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Arsitektur-TA-1.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Arsitektur-TA-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Arsitektur-TA-2.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Arsitektur-TA-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Arsitektur-TA-3.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Data-tipe-A.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Data-tipe-A.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Data-tipe-B.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Data-tipe-B.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Data-tipe-C.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Data-tipe-C.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Full-fine-tune.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Full-fine-tune.png
--------------------------------------------------------------------------------
/docs/book/src/resources/Head-fine-tune.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Head-fine-tune.png
--------------------------------------------------------------------------------
/docs/book/src/resources/cbow-skip-gram-illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/cbow-skip-gram-illustration.png
--------------------------------------------------------------------------------
/docs/book/src/resources/cover-ganesha.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/cover-ganesha.jpg
--------------------------------------------------------------------------------
/docs/book/src/resources/data_xlm_r.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/data_xlm_r.png
--------------------------------------------------------------------------------
/docs/book/src/resources/ilustrasi-mlm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/ilustrasi-mlm.png
--------------------------------------------------------------------------------
/docs/book/src/resources/ilustration-eng-spn-word.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/ilustration-eng-spn-word.png
--------------------------------------------------------------------------------
/docs/book/src/resources/linimasa-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/linimasa-1.jpg
--------------------------------------------------------------------------------
/docs/book/src/resources/linimasa-2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/linimasa-2.jpg
--------------------------------------------------------------------------------
/docs/book/src/resources/luong_et_al_2015.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/luong_et_al_2015.jpg
--------------------------------------------------------------------------------
/docs/book/src/resources/overview-attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/overview-attention.png
--------------------------------------------------------------------------------
/docs/book/src/resources/overview-transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/overview-transformer.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-full-prosa-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-prosa-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-full-toxic-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-toxic-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-full-trip-advisor-xlmr-duplicate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-trip-advisor-xlmr-duplicate.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-full-trip-advisor-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-trip-advisor-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-gain-mbert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-gain-mbert.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-gain-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-gain-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-head-prosa-mbert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-prosa-mbert.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-head-prosa-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-prosa-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-head-toxic-mbert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-toxic-mbert.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-head-toxic-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-toxic-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-head-trip-mbert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-trip-mbert.png
--------------------------------------------------------------------------------
/docs/book/src/resources/plot-head-trip-xlmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-trip-xlmr.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-mbert-eng-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-eng-1.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-mbert-eng-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-eng-2.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-mbert-malay-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-malay-1.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-mbert-malay-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-malay-2.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-xlmr-eng-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-eng-1.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-xlmr-eng-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-eng-2.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-xlmr-malay-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-malay-1.png
--------------------------------------------------------------------------------
/docs/book/src/resources/prosa-xlmr-malay-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-malay-2.png
--------------------------------------------------------------------------------
/docs/book/src/resources/tandatangan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/tandatangan.png
--------------------------------------------------------------------------------
/docs/book/src/resources/tandatangan_bu_ayu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/tandatangan_bu_ayu.png
--------------------------------------------------------------------------------
/docs/book/src/thesis-blx.bib:
--------------------------------------------------------------------------------
1 | @Comment{$ biblatex control file $}
2 | @Comment{$ biblatex version 2.9 $}
3 | Do not modify this file!
4 |
5 | This is an auxiliary file used by the 'biblatex' package.
6 | This file may safely be deleted. It will be recreated as
7 | required.
8 |
9 | @Control{biblatex-control,
10 | options = {2.9:0:0:1:0:1:1:0:0:1:0:2:3:1:79:+:nyt},
11 | }
12 |
--------------------------------------------------------------------------------
/docs/book/src/thesis.run.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
23 |
28 |
33 |
36 |
39 |
42 | ]>
43 |
44 |
45 | latex
46 |
47 | thesis.aux
48 | thesis-blx.bib
49 |
50 |
51 | thesis.bbl
52 |
53 |
54 | blx-dm.def
55 | blx-compat.def
56 | blx-bibtex.def
57 | biblatex.def
58 | numeric.bbx
59 | standard.bbx
60 | authoryear.cbx
61 | biblatex.cfg
62 | english.lbx
63 | russian.lbx
64 |
65 |
66 |
67 | bibtex
68 |
69 | bibtex
70 |
71 | thesis
72 |
73 |
74 | thesis.aux
75 |
76 |
79 |
80 | thesis.bbl
81 |
82 |
83 | thesis.aux
84 | thesis-blx.bib
85 |
86 |
87 | references.bib
88 |
89 |
90 | biblatex.bst
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/docs/book/src/thesis.tex:
--------------------------------------------------------------------------------
1 | %--------------------------------------------------------------------%
2 | %
3 | % Berkas utama templat LaTeX.
4 | %
5 | % author Ilham Firdausi Putra
6 | % template dari Petra Barus dan Peb Ruswono Aryan
7 | %
8 | %--------------------------------------------------------------------%
9 | %
10 | % Berkas ini berisi struktur utama dokumen LaTeX yang akan dibuat.
11 | %
12 | %--------------------------------------------------------------------%
13 |
14 | \documentclass[12pt, a4paper, onecolumn, oneside, final]{report}
15 |
16 | \input{config/if-itb-thesis.sty}
17 |
18 | \makeatletter
19 |
20 | \makeatother
21 |
22 | \bibliography{references}
23 |
24 | \begin{document}
25 |
26 | %Basic configuration
27 | \title{
28 | Klasifikasi Teks Berbahasa Indonesia Menggunakan \textit{Multilingual Language Model} \\
29 | (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)}
30 | \date{}
31 | \author{
32 | Ilham Firdausi Putra\\
33 | NIM : 13516140
34 | }
35 |
36 | \pagenumbering{roman}
37 | \setcounter{page}{0}
38 |
39 | \input{chapters/cover}
40 | \input{chapters/approval}
41 | \input{chapters/statement}
42 |
43 | \pagestyle{plain}
44 |
45 | \input{chapters/abstract-id}
46 | % \input{chapters/abstract-en}
47 | \input{chapters/forewords}
48 |
49 | % \titleformat*{\section}{\centering\bfseries\Large\MakeUpperCase}
50 | \titleformat*{\section}{\centering\bfseries\fontsize{8}{12}\MakeUpperCase}
51 |
52 | \tableofcontents
53 | \listoffigures
54 | \listoftables
55 | \input{chapters/daftar_istilah}
56 |
57 |
58 | % \titleformat*{\section}{\bfseries\Large}
59 | \titleformat*{\section}{\bfseries\fontsize{8}{12}}
60 | \titleformat*{\subsection}{\bfseries\fontsize{8}{12}}
61 | \pagenumbering{arabic}
62 |
63 | %----------------------------------------------------------------%
64 | % Konfigurasi Bab
65 | %----------------------------------------------------------------%
66 | \setcounter{page}{1}
67 | \renewcommand{\chaptername}{BAB}
68 | \renewcommand{\thechapter}{\Roman{chapter}}
69 | %----------------------------------------------------------------%
70 |
71 | %----------------------------------------------------------------%
72 | % Daftar Bab
73 | % Untuk menambahkan daftar bab, buat berkas bab misalnya `chapter-6` di direktori `chapters`, dan masukkan ke sini.
74 | %----------------------------------------------------------------%
75 | \input{chapters/chapter-1}
76 | \input{chapters/chapter-2}
77 | \input{chapters/chapter-3}
78 | \input{chapters/chapter-4}
79 | \input{chapters/chapter-5}
80 | %----------------------------------------------------------------%
81 |
82 | % Daftar pustaka
83 | \printbibliography[title={Daftar Pustaka}]
84 |
85 | % Index
86 | \appendix
87 |
88 | \addcontentsline{toc}{part}{Lampiran}
89 | \part*{Lampiran}
90 |
91 | \input{chapters/appendix-1}
92 | % \input{chapters/appendix-2}
93 |
94 | \end{document}
95 |
--------------------------------------------------------------------------------
/docs/paper/Figure_explained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Figure_explained.png
--------------------------------------------------------------------------------
/docs/paper/Figure_symbol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Figure_symbol.png
--------------------------------------------------------------------------------
/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.docx
--------------------------------------------------------------------------------
/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.pdf
--------------------------------------------------------------------------------
/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.zip
--------------------------------------------------------------------------------
/docs/paper/Improving-Indonesian-Text-Classification-Using-Multilingual-Language-Model-Putra.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving-Indonesian-Text-Classification-Using-Multilingual-Language-Model-Putra.pdf
--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # Notebook Source Code
2 | Except for the `/result_analysis` directory, every notebook in this directory was originally a Kaggle kernel. I have provided the `.ipynb` files in this directory along with links to the original Kaggle kernels. The naming is slightly different from the original:
3 | * `trip_advisor` means the data is from [(Farhan & Khodra, 2017)](https://www.researchgate.net/publication/320832619_Sentiment-specific_word_embedding_for_Indonesian_sentiment_analysis)
4 | * `prosa` means the data is from [(Crisdayanti & Purwarianti, 2019)](https://ieeexplore.ieee.org/abstract/document/8904199/)
5 | * `toxic` means the data is from [(Ibrohim & Budi, 2019)](https://www.aclweb.org/anthology/W19-3506.pdf)
6 |
7 |
8 | Links to the original Kaggle kernel:
9 | * fine_tune_full
10 | * prosa
11 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-prosa-xlm-r)
12 | * toxic
13 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-toxic-xlm-r-simpler)
14 | * [xlm_r_comparable](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-toxic-xlm-r-comparable/)
15 | * trip_advisor
16 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-tripadvisor-xlm-r)
17 | * [xlm_r_duplicate_removed](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-tripadvisor-xlm-r-dupli)
18 |
19 | * fine_tune_head
20 | * prosa
21 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-prosa-mbert-all)
22 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-prosa-xlm-r-all)
23 | * toxic
24 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-toxic-mbert-all)
25 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-toxic-xlm-r-all)
26 | * trip_advisor
27 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-tripadvisor-mbert-all)
28 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-tripadvisor-xlm-r-all)
29 | * extracting_features
30 | * prosa
31 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-prosa-features-mbert)
32 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-prosa-features-xlm-r)
33 | * toxic
34 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-features-mbert)
35 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-features-xlm-r)
36 | * trip_advisor
37 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-tripadvisor-features-mbert)
38 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-tripadvisor-features-xlm-r)
39 | * yelp_review
40 | * mbert
41 | * [mbert_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-1)
42 | * [mbert_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-2)
43 | * [mbert_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-3)
44 | * [mbert_4](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-4)
45 | * [mbert_5](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-5)
46 | * [mbert_6](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-6)
47 | * [mbert_7](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-7)
48 | * [mbert_8](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-8)
49 | * [mbert_9](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-9)
50 | * [combining_mbert](https://www.kaggle.com/ilhamfp31/indoxtc-combining-yelp-features-mbert)
51 | * xlm_r
52 | * [xlm_r_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-1)
53 | * [xlm_r_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-2)
54 | * [xlm_r_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-3)
55 | * [xlm_r_4](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-4)
56 | * [xlm_r_5](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-5)
57 | * [xlm_r_6](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-6)
58 | * [xlm_r_7](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-7)
59 | * [xlm_r_8](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-8)
60 | * [xlm_r_9](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-9)
61 | * [combining_xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-combining-yelp-features-xlm-r)
62 | * jigsaw_toxic
63 | * mbert
64 | * [mbert_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-1)
65 | * [mbert_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-2)
66 | * [mbert_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-3)
67 | * [mbert_4](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-4)
68 | * [mbert_5](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-5)
69 | * [mbert_6](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-6)
70 | * [combining_mbert](https://www.kaggle.com/ilhamfp31/indoxtc-combining-toxic-en-features-mbert)
71 | * xlm_r
72 | * [xlm_r_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-xlm-r-1)
73 | * [xlm_r_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-xlm-r-2)
74 | * [xlm_r_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-xlm-r-3)
75 | * [combining_xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-combining-toxic-en-features-xlm-r)
76 |
77 |
--------------------------------------------------------------------------------
/notebooks/fine_tune_head/extracting_features/jigsaw_toxic/mbert/indoxtc-combining-toxic-en-features-mbert.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# IndoXTC - Combining Toxic-EN Features [mBERT]\n",
8 | "Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n",
9 | "\n",
10 | "This kernel is a part of my undergraduate final year project.\n",
11 | "Checkout the full github repository:\n",
12 | "https://github.com/ilhamfp/indonesian-text-classification-multilingual"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {
19 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
20 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/__notebook__.ipynb\n",
28 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/custom.css\n",
29 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/train_label.csv\n",
30 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/__results__.html\n",
31 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/__output__.json\n",
32 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/train_text.npy\n",
33 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/__notebook__.ipynb\n",
34 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/custom.css\n",
35 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/train_label.csv\n",
36 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/__results__.html\n",
37 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/__output__.json\n",
38 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/train_text.npy\n",
39 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/__notebook__.ipynb\n",
40 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/custom.css\n",
41 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/train_label.csv\n",
42 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/__results__.html\n",
43 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/__output__.json\n",
44 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/train_text.npy\n",
45 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/__notebook__.ipynb\n",
46 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/custom.css\n",
47 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/train_label.csv\n",
48 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/__results__.html\n",
49 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/__output__.json\n",
50 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/train_text.npy\n",
51 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/__notebook__.ipynb\n",
52 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/custom.css\n",
53 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/train_label.csv\n",
54 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/__results__.html\n",
55 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/__output__.json\n",
56 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/train_text.npy\n",
57 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/__notebook__.ipynb\n",
58 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/custom.css\n",
59 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/train_label.csv\n",
60 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/__results__.html\n",
61 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/__output__.json\n",
62 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/train_text.npy\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "import numpy as np\n",
68 | "import pandas as pd \n",
69 | "import os\n",
70 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
71 | " for filename in filenames:\n",
72 | " print(os.path.join(dirname, filename))"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 2,
78 | "metadata": {
79 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
80 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
81 | },
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "(120000, 1, 768)\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "train_x = np.concatenate([\n",
93 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-1/train_text.npy', allow_pickle=True)]),\n",
94 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-2/train_text.npy', allow_pickle=True)]),\n",
95 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-3/train_text.npy', allow_pickle=True)]),\n",
96 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-4/train_text.npy', allow_pickle=True)]),\n",
97 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-5/train_text.npy', allow_pickle=True)]),\n",
98 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-6/train_text.npy', allow_pickle=True)]),\n",
99 | " ])\n",
100 | "\n",
101 | "print(train_x.shape)\n",
102 | "np.save(\"train_text.npy\", train_x)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 3,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "name": "stdout",
112 | "output_type": "stream",
113 | "text": [
114 | "(120000, 1)\n",
115 | "1 60000\n",
116 | "0 60000\n",
117 | "Name: label, dtype: int64\n"
118 | ]
119 | },
120 | {
121 | "data": {
122 | "text/html": [
123 | "