├── .gitattributes
├── .github
│   └── FUNDING.yml
├── .gitignore
├── .ipynb_checkpoints
│   ├── Subreddit_and_solution-checkpoint.ipynb
│   ├── Tf-Idf-checkpoint.ipynb
│   ├── W2VXAuthor-checkpoint.ipynb
│   ├── data-mining-challange-checkpoint.Rproj
│   ├── resources -checkpoint.Rmd
│   ├── scoreXSparseValidation-checkpoint.csv
│   ├── submission-checkpoint.csv
│   └── tutorial-checkpoint.ipynb
├── LICENSE
├── Notebooks
│   ├── .DS_Store
│   ├── other-attempts
│   │   ├── .DS_Store
│   │   ├── keras-neural-networks
│   │   │   ├── .DS_Store
│   │   │   ├── .ipynb_checkpoints
│   │   │   │   ├── embeddings-checkpoint.ipynb
│   │   │   │   └── simple-net-prediction-checkpoint.ipynb
│   │   │   ├── README.md
│   │   │   ├── embeddings.ipynb
│   │   │   ├── pretrained-embeddings.ipynb
│   │   │   ├── simple-net-grid.ipynb
│   │   │   ├── simple-net-prediction.ipynb
│   │   │   └── simple-net.ipynb
│   │   └── spaCy
│   │       ├── .DS_Store
│   │       ├── ReadMe.md
│   │       ├── data_preparation
│   │       │   ├── .ipynb_checkpoints
│   │       │   │   ├── lemmatizer-checkpoint.ipynb
│   │       │   │   └── vectorizer-checkpoint.ipynb
│   │       │   ├── lemmatizer.ipynb
│   │       │   └── vectorizer.ipynb
│   │       ├── finals
│   │       │   ├── .ipynb_checkpoints
│   │       │   │   ├── ReadMe-checkpoint.ipynb
│   │       │   │   ├── final_bal_lr-checkpoint.ipynb
│   │       │   │   ├── final_bodieswS-checkpoint.ipynb
│   │       │   │   ├── final_lr-checkpoint.ipynb
│   │       │   │   ├── final_subreddits-checkpoint.ipynb
│   │       │   │   ├── final_svm-checkpoint.ipynb
│   │       │   │   ├── lemmatizer_final-checkpoint.ipynb
│   │       │   │   ├── solution-checkpoint.ipynb
│   │       │   │   ├── solution_bal-checkpoint.ipynb
│   │       │   │   └── spacyW2v_Final-checkpoint.ipynb
│   │       │   ├── ReadMe.md
│   │       │   ├── final_bal_lr.ipynb
│   │       │   ├── final_bodieswS.ipynb
│   │       │   ├── final_lr.ipynb
│   │       │   ├── final_subreddits.ipynb
│   │       │   ├── final_svm.ipynb
│   │       │   ├── lemmatizer_final.ipynb
│   │       │   ├── solution.ipynb
│   │       │   ├── solution_bal.ipynb
│   │       │   └── spacyW2v_Final.ipynb
│   │       ├── images
│   │       │   ├── bodieswS_test_ensemble_balanced_e15_wS.png
│   │       │   └── bodieswS_test_ensemble_balanced_e3.png
│   │       ├── intermediate_models
│   │       │   ├── .ipynb_checkpoints
│   │       │   │   ├── ReadMe-checkpoint.ipynb
│   │       │   │   ├── final-checkpoint.ipynb
│   │       │   │   ├── spactW2v-checkpoint.ipynb
│   │       │   │   ├── spacyBowAggAveraged-checkpoint.ipynb
│   │       │   │   ├── spacySubreddits-checkpoint.ipynb
│   │       │   │   ├── spacyTransformerEnsembleLemmatizedAveraged-checkpoint.ipynb
│   │       │   │   └── subreddits-checkpoint.ipynb
│   │       │   ├── ReadMe.md
│   │       │   ├── final.ipynb
│   │       │   ├── spactW2v.ipynb
│   │       │   ├── spacyBowAggAveraged.ipynb
│   │       │   ├── spacySubreddits.ipynb
│   │       │   ├── spacyTransformerEnsembleLemmatizedAveraged.ipynb
│   │       │   └── subreddits.ipynb
│   │       └── outputs
│   │           ├── .ipynb_checkpoints
│   │           │   ├── Untitled-checkpoint.ipynb
│   │           │   ├── bow_bal_lPunctAgg-checkpoint.txt
│   │           │   ├── bow_bal_lPunctNumAgg-checkpoint.txt
│   │           │   ├── bow_dlPunctNumStopLemOovAgg-checkpoint.txt
│   │           │   ├── bow_lPunctAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumLemAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumLemOovAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumOovAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumPersAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumPersLemAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumPersLemOovAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumStopLemAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumStopLemOovAgg-checkpoint.txt
│   │           │   ├── bow_lPunctNumStopOovAgg-checkpoint.txt
│   │           │   ├── ensemble_bal_lPunctAgg-checkpoint.txt
│   │           │   ├── ensemble_bal_lPunctNumStopLemOovAgg-checkpoint.txt
│   │           │   ├── ensemble_dlPunctNumLemOovAgg-checkpoint.txt
│   │           │   ├── ensemble_dlPunctNumStopLemOovAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumLemAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumLemOovAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumOovAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumPersAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumPersLemAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumPersLemOovAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumStopLemAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumStopLemOovAgg-checkpoint.txt
│   │           │   ├── ensemble_lPunctNumStopOovAgg-checkpoint.txt
│   │           │   └── spacyW2vMlp-checkpoint.txt
│   │           ├── bow_bal_lPunctAgg.txt
│   │           ├── bow_bal_lPunctNumAgg.txt
│   │           ├── bow_bal_lPunctNumLemAgg.txt
│   │           ├── bow_bal_lPunctNumLemOovAgg.txt
│   │           ├── bow_bal_lPunctNumOovAgg.txt
│   │           ├── bow_bal_lPunctNumPersAgg.txt
│   │           ├── bow_bal_lPunctNumPersLemAgg.txt
│   │           ├── bow_bal_lPunctNumPersLemOovAgg.txt
│   │           ├── bow_bal_lPunctNumStopLemAgg.txt
│   │           ├── bow_bal_lPunctNumStopLemOovAgg.txt
│   │           ├── bow_bal_lPunctNumStopOovAgg.txt
│   │           ├── bow_dlPunctNumStopLemOovAgg.txt
│   │           ├── bow_lPunctAgg.txt
│   │           ├── bow_lPunctNumAgg.txt
│   │           ├── bow_lPunctNumLemAgg.txt
│   │           ├── bow_lPunctNumLemOovAgg.txt
│   │           ├── bow_lPunctNumOovAgg.txt
│   │           ├── bow_lPunctNumPersAgg.txt
│   │           ├── bow_lPunctNumPersLemAgg.txt
│   │           ├── bow_lPunctNumPersLemOovAgg.txt
│   │           ├── bow_lPunctNumStopLemAgg.txt
│   │           ├── bow_lPunctNumStopLemOovAgg.txt
│   │           ├── bow_lPunctNumStopOovAgg.txt
│   │           ├── ensemble_bal_lPunctAgg.txt
│   │           ├── ensemble_bal_lPunctNumAgg.txt
│   │           ├── ensemble_bal_lPunctNumLemAgg.txt
│   │           ├── ensemble_bal_lPunctNumLemOovAgg.txt
│   │           ├── ensemble_bal_lPunctNumOovAgg.txt
│   │           ├── ensemble_bal_lPunctNumPersAgg.txt
│   │           ├── ensemble_bal_lPunctNumPersLemAgg.txt
│   │           ├── ensemble_bal_lPunctNumPersLemOovAgg.txt
│   │           ├── ensemble_bal_lPunctNumStopLemAgg.txt
│   │           ├── ensemble_bal_lPunctNumStopLemOovAgg.txt
│   │           ├── ensemble_bal_lPunctNumStopOovAgg.txt
│   │           ├── ensemble_dlPunctNumLemOovAgg.txt
│   │           ├── ensemble_dlPunctNumStopLemOovAgg.txt
│   │           ├── ensemble_lPunctAgg.txt
│   │           ├── ensemble_lPunctNumAgg.txt
│   │           ├── ensemble_lPunctNumLemAgg.txt
│   │           ├── ensemble_lPunctNumLemOovAgg.txt
│   │           ├── ensemble_lPunctNumOovAgg.txt
│   │           ├── ensemble_lPunctNumPersAgg.txt
│   │           ├── ensemble_lPunctNumPersLemAgg.txt
│   │           ├── ensemble_lPunctNumPersLemOovAgg.txt
│   │           ├── ensemble_lPunctNumStopLemAgg.txt
│   │           ├── ensemble_lPunctNumStopLemOovAgg.txt
│   │           ├── ensemble_lPunctNumStopOovAgg.txt
│   │           ├── softmax_bert_lPunctNumStopLemOovAgg.txt
│   │           └── spacyW2vMlp.txt
│   └── successful-models
│       ├── .DS_Store
│       ├── .ipynb_checkpoints
│       │   ├── Final_sub-checkpoint.ipynb
│       │   ├── MLPs(90)_sub-checkpoint.ipynb
│       │   ├── MLPs_test_sub-checkpoint.ipynb
│       │   ├── doc2vec-4000-checkpoint.ipynb
│       │   ├── doc2vec-5000-checkpoint.ipynb
│       │   ├── final-model-selection-checkpoint.ipynb
│       │   ├── mlp-subreddits-4000-checkpoint.ipynb
│       │   ├── mlp-subreddits-5000-checkpoint.ipynb
│       │   ├── submission-checkpoint.ipynb
│       │   ├── xgb-4000-checkpoint.ipynb
│       │   ├── xgb-5000-checkpoint.ipynb
│       │   └── xgb-gridsearch-checkpoint.ipynb
│       ├── doc2vec-4000.ipynb
│       ├── doc2vec-5000.ipynb
│       ├── final-model-selection.ipynb
│       ├── mlp-subreddits-4000.ipynb
│       ├── mlp-subreddits-5000.ipynb
│       ├── submission.ipynb
│       ├── xgb-4000.ipynb
│       ├── xgb-5000.ipynb
│       └── xgb-gridsearch.ipynb
├── README.md
├── _config.yml
├── images
│   └── flow-chart.png
└── index.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.csv
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # GitHub Sponsors
2 |
3 | github: [pitmonticone]
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
--------------------------------------------------------------------------------
/.ipynb_checkpoints/data-mining-challange-checkpoint.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/resources -checkpoint.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Data Mining Challange: Resoures"
3 | author: "Pietro Monticone"
4 | date: "`r Sys.Date()` | Turin University"
5 | output:
6 | prettydoc::html_pretty:
7 | theme: cayman
8 | highlight: github
9 | toc: true
10 | ---
11 |
12 | ```{r setup, include=FALSE}
13 | knitr::opts_chunk$set(
14 | echo = FALSE,
15 | message = FALSE,
16 | warning = FALSE
17 | )
18 | ```
19 |
20 | # Data Camp
21 |
22 | ## Theory
23 | * [Data Science for Everyone](https://www.datacamp.com/courses/data-science-for-everyone)
24 | * [Machine Learning for Everyone](https://www.datacamp.com/courses/machine-learning-for-everyone)
25 |
26 | ## Python
27 |
28 | ### Programming
29 | * [Introduction to Python](https://www.datacamp.com/courses/intro-to-python-for-data-science)
30 | * [Intermediate Python](https://www.datacamp.com/courses/intermediate-python)
31 | * [Data Science Toolbox 1](https://www.datacamp.com/courses/python-data-science-toolbox-part-1)
32 | * [Data Science Toolbox 2](https://www.datacamp.com/courses/python-data-science-toolbox-part-2)
33 |
34 | #### Coding Best Practices with Python
35 | * [Writing Efficient Python Code](https://www.datacamp.com/courses/writing-efficient-python-code)
36 | * [Writing Efficient Code with pandas](https://www.datacamp.com/courses/writing-efficient-code-with-pandas)
37 | * [Writing Functions in Python](https://www.datacamp.com/courses/writing-functions-in-python)
38 | * [Object-Oriented Programming in Python](https://www.datacamp.com/courses/object-oriented-programming-in-python)
39 |
40 | ### Data Collection & Cleaning
41 |
42 | * [Introduction](https://www.datacamp.com/courses/introduction-to-importing-data-in-python)
43 | * [Intermediate](https://www.datacamp.com/courses/intermediate-importing-data-in-python)
44 | * [Cleaning](https://www.datacamp.com/courses/cleaning-data-in-python)
45 |
46 | ### Data Manipulation
47 | * [pandas Foundations](https://www.datacamp.com/courses/pandas-foundations)
48 | * [Manipulating DataFrames with pandas](https://www.datacamp.com/courses/manipulating-dataframes-with-pandas)
49 | * [Merging DataFrames with pandas](https://www.datacamp.com/courses/merging-dataframes-with-pandas)
50 |
51 | ### Data Visualization
52 | * [Introduction to Data Viz with Matplotlib](https://www.datacamp.com/courses/introduction-to-data-visualization-with-matplotlib)
53 | * [Introduction to Data Viz with Seaborn](https://www.datacamp.com/courses/introduction-to-data-visualization-with-seaborn)
54 | * [Improving Data Viz](https://www.datacamp.com/courses/improving-your-data-visualizations-in-python)
55 | * [Interactive Data Viz](https://www.datacamp.com/courses/interactive-data-visualization-with-bokeh)
56 |
57 | ### Machine Learning
58 | * [Supervised Learning with scikit-learn](https://www.datacamp.com/courses/supervised-learning-with-scikit-learn)
59 | * [Unsupervised Learning in Python](https://www.datacamp.com/courses/unsupervised-learning-in-python)
60 | * [Linear Classifiers in Python](https://www.datacamp.com/courses/linear-classifiers-in-python)
61 |
62 | #### NLP
63 | * [Introduction to Natural Language Processing in Python](https://www.datacamp.com/courses/introduction-to-natural-language-processing-in-python)
64 | * [Advanced NLP with spaCy](https://learn.datacamp.com/courses/advanced-nlp-with-spacy)
65 |
66 | # Kaggle
67 | * Python
68 | * Bag of words
69 |
70 | # Notebooks
71 |
72 | # Github
73 | * **2018** [Project by Simone Azeglio](https://github.com/simoneazeglio/DataMiningChallenge2018)
74 |
75 | # Lectures
76 |
77 | ## MIT
78 |
79 | * **2020** [6.S191: Introduction to Deep Learning](http://introtodeeplearning.com)
80 |
81 |
82 |
83 | ## Caltech
84 |
85 | ## Stanford
86 |
87 | * **2020** [CS224n: Natural Language Processing with Deep Learning](http://web.stanford.edu/class/cs224n/index.html)
88 | * **2019** [CS224n: Natural Language Processing with Deep Learning](https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/)
89 |
90 | ## 3B1B
91 | * [Neural Networks](https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi)
92 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Interdisciplinary Physics Team (InPhyT)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Notebooks/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/.DS_Store
--------------------------------------------------------------------------------
/Notebooks/other-attempts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/other-attempts/.DS_Store
--------------------------------------------------------------------------------
/Notebooks/other-attempts/keras-neural-networks/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/other-attempts/keras-neural-networks/.DS_Store
--------------------------------------------------------------------------------
/Notebooks/other-attempts/keras-neural-networks/.ipynb_checkpoints/simple-net-prediction-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "[name: \"/device:CPU:0\"\n",
13 | "device_type: \"CPU\"\n",
14 | "memory_limit: 268435456\n",
15 | "locality {\n",
16 | "}\n",
17 | "incarnation: 1331614535948791056\n",
18 | ", name: \"/device:GPU:0\"\n",
19 | "device_type: \"GPU\"\n",
20 | "memory_limit: 7473294746\n",
21 | "locality {\n",
22 | " bus_id: 1\n",
23 | " links {\n",
24 | " }\n",
25 | "}\n",
26 | "incarnation: 17851818086571483571\n",
27 | "physical_device_desc: \"device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1\"\n",
28 | "]\n",
29 | "2.3.1\n",
30 | "Wall time: 1.1 s\n"
31 | ]
32 | }
33 | ],
34 | "source": [
35 | "%%time\n",
36 | "#print(\"1\")\n",
37 | "import tensorflow as tf\n",
38 | "from numba import cuda\n",
39 | "from tensorflow.python.client import device_lib\n",
40 | "print(device_lib.list_local_devices())\n",
41 | "from keras.preprocessing.sequence import pad_sequences\n",
42 | "#print(\"2\")\n",
43 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
44 | "from sklearn.preprocessing import StandardScaler\n",
45 | "import pickle\n",
46 | "from keras.layers import Dense, Input, Dropout\n",
47 | "#print(\"3\")\n",
48 | "from keras import Sequential\n",
49 | "#print(\"4\")\n",
50 | "from sklearn.preprocessing import StandardScaler\n",
51 | "from tensorflow.keras.callbacks import EarlyStopping\n",
52 | "import matplotlib.pyplot as plt\n",
53 | "import keras\n",
54 | "print(keras.__version__)\n",
55 | "from sklearn.model_selection import train_test_split\n",
56 | "from keras.constraints import maxnorm\n",
57 | "import numpy as np"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 7,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "training tfidf and tranforming\n",
70 | "vocab_size = 120536\n",
71 | "padding\n",
72 | "done\n",
73 | "Wall time: 1min 50s\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "%%time\n",
79 | "\n",
80 | "with open(r\"comments.txt\", \"rb\") as f:\n",
81 | " clean_train_comments = pickle.load(f) \n",
82 | " f.close()\n",
83 | "\n",
84 | "with open(r\"targets.txt\", \"rb\") as ft:\n",
85 | " y= pickle.load(ft) \n",
86 | " ft.close()\n",
87 | "\n",
88 | " \n",
89 | "y = [int(s) for s in y]\n",
90 | "\n",
91 | "\n",
92 | "\n",
93 | "#tfidf vectorization\n",
94 | "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,\n",
95 | " ngram_range=(1, 2), \n",
96 | " stop_words='english')\n",
97 | "\n",
98 | "# We transform each complaint into a vector\n",
99 | "print(\"training tfidf and tranforming\")\n",
100 | "X = tfidf.fit_transform(clean_train_comments).toarray() #clean-train_comments # as this: https://stats.stackexchange.com/questions/154660/tfidfvectorizer-should-it-be-used-on-train-only-or-traintest and this: https://stackoverflow.com/questions/47778403/computing-tf-idf-on-the-whole-dataset-or-only-on-training-data suggest,train tfidf only on training set\n",
101 | "vocab_size = len(tfidf.vocabulary_) + 1\n",
102 | "print(\"vocab_size = \", vocab_size)\n",
103 | "# evaluate max len train data\n",
104 | "maxlen = max([len(x) for x in X])\n",
105 | "# pad train data accordingly\n",
106 | "print(\"padding\")\n",
107 | "X_pad = pad_sequences(X, padding='post', maxlen=maxlen, dtype='float32') \n",
108 | "\n",
109 | "print(\"done\")"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 8,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# Define the models.\n",
119 | "\n",
120 | "def model0(): # from https://medium.com/@am.benatmane/keras-hyperparameter-tuning-using-sklearn-pipelines-grid-search-with-cross-validation-ccfc74b0ce9f\n",
121 | "\n",
122 | " METRICS = [ \n",
123 | " tf.keras.metrics.BinaryAccuracy(name='accuracy'),\n",
124 | " tf.keras.metrics.AUC(name='auc'),\n",
125 | " ]\n",
126 | "\n",
127 | " optimizer=\"Adamax\" #\"adam\"\n",
128 | " dropout=0.1 #0.1\n",
129 | " init='uniform'\n",
130 | " nbr_features= vocab_size-1 #2500\n",
131 | " dense_nparams=256\n",
132 | "\n",
133 | " model = Sequential()\n",
134 | " model.add(Dense(dense_nparams, activation='softsign', input_shape=(nbr_features,), kernel_initializer=init, kernel_constraint=maxnorm(3))) # maxnorm(0) & softmax & sigmoid -> 0.89 # maxnorm(0) & softmax & softmax -> 0.5 maxnorm(2) & relu & sigmoid ->0.92 maxnorm(1) & relu & sigmoid ->0.82\n",
135 | " model.add(Dropout(dropout))\n",
136 | " model.add(Dense(1, activation='sigmoid')) # relu & \"softmax\" fa 0.5-> non va bene #' relu & softplus' -> 0.75 #'sigmoid'\n",
137 | " model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics = METRICS)\n",
138 | " return model\n",
139 | " "
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "Epoch 1/3\n",
152 | "Epoch 2/3\n",
153 | "Epoch 3/3\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "\n",
159 | "model = model0()\n",
160 | "\n",
161 | "history = model.fit(x=X_pad, y=y, batch_size = 8, epochs = 3, verbose=10, shuffle=True, max_queue_size=10, workers=4, use_multiprocessing=True) #, callbacks=callbacks , validation_split=0.2\n",
162 | "\n",
163 | "# reset gpu memory https://stackoverflow.com/a/60354785/13110508 (but be warned: it crashes python, so use it just at the end)\n",
164 | "# device = cuda.get_current_device()\n",
165 | "# device.reset()"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 10,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "import pandas as pd\n",
175 | "with open(r\"comments_test.txt\", \"rb\") as f:\n",
176 | " clean_test_comments = pickle.load(f) \n",
177 | " f.close()\n",
178 | " \n",
179 | " \n",
180 | "X_test = tfidf.transform(clean_test_comments).toarray()\n",
181 | "maxlen_test = max([len(x) for x in X_test])\n",
182 | "X_test_pad = pad_sequences(X_test, padding='post', maxlen=maxlen, dtype='float32')\n",
183 | "#X_test_pad_scal = scaler.transform(X_test_pad)\n",
184 | "\n",
185 | "y_pred = model.predict_proba(X_test_pad)\n",
186 | "y_pred_unp = [y_pred[i][0] for i in range(len(y_pred))]\n",
187 | "with open(r\"authors_test.txt\", \"rb\") as f:\n",
188 | " authors = pickle.load(f) \n",
189 | " f.close()\n",
190 | " \n",
191 | "solution = pd.DataFrame({\"author\":authors, \"gender\":y_pred_unp})\n",
192 | "\n",
193 | "solution.to_csv(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\data\\challengedadata\\solutions\\simpleNetNoScalProbaGridD0_sol.csv\",index = False)"
194 | ]
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python 3",
200 | "language": "python",
201 | "name": "python3"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.7.4"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 4
218 | }
219 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/keras-neural-networks/README.md:
--------------------------------------------------------------------------------
1 | ## Keras Neural Networks
2 |
3 | This folder contains three neural-network approaches.
4 |
5 | ### 1. TFIDF classification
6 |
7 | This task is carried out by `simple-net.ipynb`, `simple-net-prediction.ipynb` and `simple-net-grid.ipynb`: the first validates the model, the third grid-searches its hyperparameters and the second outputs the predictions (a minimal sketch of the pipeline is given at the end of this README). It reaches a ROC AUC of 89.7 on the test set.
8 |
9 | ### 2. Embeddings classification
10 |
11 | Trains an embedding layer before classifying. Several networks have been tried. Given their poorer validation performance compared to more transparent models like an MLP on doc2vec (see [successful-models](https://github.com/pitmonticone/data-mining-challange/tree/master/successful-models)), we did not consider them worth a grid-search and prediction effort. Related notebook: `embeddings.ipynb`.
12 |
13 | ### 3. Pretrained embeddings classification
14 |
15 | Same as above, but with GloVe vectors pretrained on 6B tokens with 300 dimensions. Related notebook: `pretrained-embeddings.ipynb`.
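16 |
17 | As a pointer, here is a minimal sketch of the TF-IDF + simple-net pipeline of approach 1. The corpus and targets are illustrative placeholders, and `min_df` is lowered for the toy data; the notebooks use `min_df=5` on the full comment corpus and train for 3 epochs with batch size 8.
18 |
19 | ```python
20 | import numpy as np
21 | from sklearn.feature_extraction.text import TfidfVectorizer
22 | from keras import Sequential
23 | from keras.layers import Dense, Dropout
24 | from keras.constraints import maxnorm
25 |
26 | texts = ["example comment one", "another example comment"]  # placeholder corpus
27 | y = np.array([0, 1])                                        # placeholder binary targets
28 |
29 | # Fit TF-IDF on the training texts only, as in the notebooks.
30 | tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, ngram_range=(1, 2), stop_words="english")
31 | X = tfidf.fit_transform(texts).toarray()
32 |
33 | # One hidden dense layer with dropout and a sigmoid output for binary classification.
34 | model = Sequential()
35 | model.add(Dense(256, activation="softsign", input_shape=(X.shape[1],), kernel_constraint=maxnorm(3)))
36 | model.add(Dropout(0.1))
37 | model.add(Dense(1, activation="sigmoid"))
38 | model.compile(loss="binary_crossentropy", optimizer="Adamax", metrics=["accuracy"])
39 | model.fit(X, y, epochs=3, batch_size=8)
40 | ```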
16 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/keras-neural-networks/simple-net-prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "[name: \"/device:CPU:0\"\n",
13 | "device_type: \"CPU\"\n",
14 | "memory_limit: 268435456\n",
15 | "locality {\n",
16 | "}\n",
17 | "incarnation: 1331614535948791056\n",
18 | ", name: \"/device:GPU:0\"\n",
19 | "device_type: \"GPU\"\n",
20 | "memory_limit: 7473294746\n",
21 | "locality {\n",
22 | " bus_id: 1\n",
23 | " links {\n",
24 | " }\n",
25 | "}\n",
26 | "incarnation: 17851818086571483571\n",
27 | "physical_device_desc: \"device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1\"\n",
28 | "]\n",
29 | "2.3.1\n",
30 | "Wall time: 1.1 s\n"
31 | ]
32 | }
33 | ],
34 | "source": [
35 | "%%time\n",
36 | "#print(\"1\")\n",
37 | "import tensorflow as tf\n",
38 | "from numba import cuda\n",
39 | "from tensorflow.python.client import device_lib\n",
40 | "print(device_lib.list_local_devices())\n",
41 | "from keras.preprocessing.sequence import pad_sequences\n",
42 | "#print(\"2\")\n",
43 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
44 | "from sklearn.preprocessing import StandardScaler\n",
45 | "import pickle\n",
46 | "from keras.layers import Dense, Input, Dropout\n",
47 | "#print(\"3\")\n",
48 | "from keras import Sequential\n",
49 | "#print(\"4\")\n",
50 | "from sklearn.preprocessing import StandardScaler\n",
51 | "from tensorflow.keras.callbacks import EarlyStopping\n",
52 | "import matplotlib.pyplot as plt\n",
53 | "import keras\n",
54 | "print(keras.__version__)\n",
55 | "from sklearn.model_selection import train_test_split\n",
56 | "from keras.constraints import maxnorm\n",
57 | "import numpy as np"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 7,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "training tfidf and tranforming\n",
70 | "vocab_size = 120536\n",
71 | "padding\n",
72 | "done\n",
73 | "Wall time: 1min 50s\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "%%time\n",
79 | "\n",
80 | "with open(r\"comments.txt\", \"rb\") as f:\n",
81 | " clean_train_comments = pickle.load(f) \n",
82 | " f.close()\n",
83 | "\n",
84 | "with open(r\"targets.txt\", \"rb\") as ft:\n",
85 | " y= pickle.load(ft) \n",
86 | " ft.close()\n",
87 | "\n",
88 | " \n",
89 | "y = [int(s) for s in y]\n",
90 | "\n",
91 | "\n",
92 | "\n",
93 | "#tfidf vectorization\n",
94 | "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,\n",
95 | " ngram_range=(1, 2), \n",
96 | " stop_words='english')\n",
97 | "\n",
98 | "# We transform each complaint into a vector\n",
99 | "print(\"training tfidf and tranforming\")\n",
100 | "X = tfidf.fit_transform(clean_train_comments).toarray() #clean-train_comments # as this: https://stats.stackexchange.com/questions/154660/tfidfvectorizer-should-it-be-used-on-train-only-or-traintest and this: https://stackoverflow.com/questions/47778403/computing-tf-idf-on-the-whole-dataset-or-only-on-training-data suggest,train tfidf only on training set\n",
101 | "vocab_size = len(tfidf.vocabulary_) + 1\n",
102 | "print(\"vocab_size = \", vocab_size)\n",
103 | "# evaluate max len train data\n",
104 | "maxlen = max([len(x) for x in X])\n",
105 | "# pad train data accordingly\n",
106 | "print(\"padding\")\n",
107 | "X_pad = pad_sequences(X, padding='post', maxlen=maxlen, dtype='float32') \n",
108 | "\n",
109 | "print(\"done\")"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 8,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# Define the models.\n",
119 | "\n",
120 | "def model0(): # from https://medium.com/@am.benatmane/keras-hyperparameter-tuning-using-sklearn-pipelines-grid-search-with-cross-validation-ccfc74b0ce9f\n",
121 | "\n",
122 | " METRICS = [ \n",
123 | " tf.keras.metrics.BinaryAccuracy(name='accuracy'),\n",
124 | " tf.keras.metrics.AUC(name='auc'),\n",
125 | " ]\n",
126 | "\n",
127 | " optimizer=\"Adamax\" #\"adam\"\n",
128 | " dropout=0.1 #0.1\n",
129 | " init='uniform'\n",
130 | " nbr_features= vocab_size-1 #2500\n",
131 | " dense_nparams=256\n",
132 | "\n",
133 | " model = Sequential()\n",
134 | " model.add(Dense(dense_nparams, activation='softsign', input_shape=(nbr_features,), kernel_initializer=init, kernel_constraint=maxnorm(3))) # maxnorm(0) & softmax & sigmoid -> 0.89 # maxnorm(0) & softmax & softmax -> 0.5 maxnorm(2) & relu & sigmoid ->0.92 maxnorm(1) & relu & sigmoid ->0.82\n",
135 | " model.add(Dropout(dropout))\n",
136 | " model.add(Dense(1, activation='sigmoid')) # relu & \"softmax\" fa 0.5-> non va bene #' relu & softplus' -> 0.75 #'sigmoid'\n",
137 | " model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics = METRICS)\n",
138 | " return model\n",
139 | " "
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "Epoch 1/3\n",
152 | "Epoch 2/3\n",
153 | "Epoch 3/3\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "\n",
159 | "model = model0()\n",
160 | "\n",
161 | "history = model.fit(x=X_pad, y=y, batch_size = 8, epochs = 3, verbose=10, shuffle=True, max_queue_size=10, workers=4, use_multiprocessing=True) #, callbacks=callbacks , validation_split=0.2\n",
162 | "\n",
163 | "# reset gpu memory https://stackoverflow.com/a/60354785/13110508 (but be warned: it crashes python, so use it just at the end)\n",
164 | "# device = cuda.get_current_device()\n",
165 | "# device.reset()"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 10,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "import pandas as pd\n",
175 | "with open(r\"comments_test.txt\", \"rb\") as f:\n",
176 | " clean_test_comments = pickle.load(f) \n",
177 | " f.close()\n",
178 | " \n",
179 | " \n",
180 | "X_test = tfidf.transform(clean_test_comments).toarray()\n",
181 | "maxlen_test = max([len(x) for x in X_test])\n",
182 | "X_test_pad = pad_sequences(X_test, padding='post', maxlen=maxlen, dtype='float32')\n",
183 | "#X_test_pad_scal = scaler.transform(X_test_pad)\n",
184 | "\n",
185 | "y_pred = model.predict_proba(X_test_pad)\n",
186 | "y_pred_unp = [y_pred[i][0] for i in range(len(y_pred))]\n",
187 | "with open(r\"authors_test.txt\", \"rb\") as f:\n",
188 | " authors = pickle.load(f) \n",
189 | " f.close()\n",
190 | " \n",
191 | "solution = pd.DataFrame({\"author\":authors, \"gender\":y_pred_unp})\n",
192 | "\n",
193 | "solution.to_csv(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\data\\challengedadata\\solutions\\simpleNetNoScalProbaGridD0_sol.csv\",index = False)"
194 | ]
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python 3",
200 | "language": "python",
201 | "name": "python3"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.7.4"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 4
218 | }
219 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/other-attempts/spaCy/.DS_Store
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/ReadMe.md:
--------------------------------------------------------------------------------
1 | # NLP with spaCy
2 |
3 | Notebooks and ReadMes inside folders provide concise descriptions of the code. For a more in-depth summary and the big picture, please read [this Stack Overflow question](https://stackoverflow.com/questions/60821793/text-classification-with-spacy-going-beyond-the-basics-to-improve-performance), this [GitHub issue](https://github.com/explosion/spaCy/issues/5224) and a comment on a [feature request](https://github.com/explosion/spaCy/issues/2253#issuecomment-605502320).
4 |
5 | To access the data, submissions and the various lemmatizations/vectorizations, please visit this [Google Drive link](https://drive.google.com/open?id=1ARPbyK6uyudZTZ9m0UEDY5_xgrH7D6PX)
6 |
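7 | For orientation, the vectorization step (`data_preparation/vectorizer.ipynb`) boils down to averaging spaCy's pre-trained word vectors into one document vector per author. A minimal sketch assuming spaCy v2 and the `en_core_web_lg` model used by the notebooks (the example texts are placeholders; the notebooks load the model from a local path):
8 |
9 | ```python
10 | import numpy as np
11 | import spacy
12 |
13 | nlp = spacy.load("en_core_web_lg")  # large English model with pre-trained vectors
14 |
15 | texts = ["first aggregated comment body", "second aggregated comment body"]  # placeholders
16 |
17 | # doc.vector is the average of the token vectors, so each text becomes one 300-d row.
18 | with nlp.disable_pipes("tagger", "parser", "ner"):  # vectors don't need these pipes
19 |     vectors = np.array([nlp(t).vector for t in texts])
20 |
21 | print(vectors.shape)  # (2, 300)
22 | ```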
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/data_preparation/.ipynb_checkpoints/vectorizer-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Vectorizer\n",
8 | "\n",
9 |     "This notebook takes all the preprocessed datasets and vectorizes them so that they can be classified with the MLP. As an exploration, we used spaCy's pre-trained vectors. Note that the document vectors are obtained by averaging the word vectors. "
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import spacy\n",
19 | "import pandas as pd\n",
20 | "import numpy as np\n",
21 | "from progressbar import ProgressBar, Bar, Percentage\n",
22 | "from os import listdir\n",
23 | "from os.path import isfile, join"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 |     "Load the big model (as per the [documentation](https://spacy.io/usage/vectors-similarity))"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "nlp = spacy.load(r\"Q:\\anaconda\\Lib\\site-packages\\en_core_web_lg\\en_core_web_lg-2.2.5\")"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "%%time\n",
49 | "\n",
50 | "def_str = r\"Q:\\\\tooBigToDrive\\data-mining\\kaggle\\data\\csv\"\n",
51 | "path = r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\data\\csv\"\n",
52 | "files = listdir(def_str)\n",
53 | "files = [f.replace(\".csv\",\"\") for f in files if \"Agg\" in f]\n",
54 | "\n",
55 | "for s in files:\n",
56 | " csvPath = def_str +\"\\\\\"+ s + \".csv\"\n",
57 | " npyPath = def_str +\"\\\\\"+ s +\"sSub\"+ \".npy\"\n",
58 | " train = pd.read_csv(csvPath)\n",
59 | " train.replace(to_replace = \"empty\", value = \"\", inplace = True)\n",
60 | " train[\"body\"].fillna(\"\",inplace = True)\n",
61 | " # enable this to add subreddits to body \n",
62 | " train[\"body\"] = train[\"subreddit\"]+\" \"+train[\"body\"]\n",
63 | " to_be_vectorized = train[\"body\"].tolist()\n",
64 | " vectorsl = []\n",
65 | " print(\"doing\"+\" \"+s+\".csv ...\", \"len(to_be_vectorized) = \",len(to_be_vectorized) )\n",
66 | " pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(to_be_vectorized)).start()\n",
67 | " i = 0\n",
68 | " # disable parser and ner pipes to have better performance\n",
69 | " with nlp.disable_pipes():\n",
70 | " for tex in to_be_vectorized:\n",
71 | " vectorsl.append(nlp(tex).vector)\n",
72 | " i += 1\n",
73 | " pbar.update(i)\n",
74 | " pbar.finish()\n",
75 | " vectors = np.array(vectorsl)\n",
76 | " np.save(npyPath,vectors)\n",
77 | " print(\"done\")\n"
78 | ]
79 | }
80 | ],
81 | "metadata": {
82 | "kernelspec": {
83 | "display_name": "Python [conda env:myEnv]",
84 | "language": "python",
85 | "name": "conda-env-myEnv-py"
86 | },
87 | "language_info": {
88 | "codemirror_mode": {
89 | "name": "ipython",
90 | "version": 3
91 | },
92 | "file_extension": ".py",
93 | "mimetype": "text/x-python",
94 | "name": "python",
95 | "nbconvert_exporter": "python",
96 | "pygments_lexer": "ipython3",
97 | "version": "3.7.6"
98 | }
99 | },
100 | "nbformat": 4,
101 | "nbformat_minor": 4
102 | }
103 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/data_preparation/vectorizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Vectorizer\n",
8 | "\n",
9 |     "This notebook takes all the preprocessed datasets and vectorizes them so that they can be classified with the MLP. As an exploration, we used spaCy's pre-trained vectors. Note that the document vectors are obtained by averaging the word vectors. "
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import spacy\n",
19 | "import pandas as pd\n",
20 | "import numpy as np\n",
21 | "from progressbar import ProgressBar, Bar, Percentage\n",
22 | "from os import listdir\n",
23 | "from os.path import isfile, join"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 |     "Load the big model (as per the [documentation](https://spacy.io/usage/vectors-similarity))"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "nlp = spacy.load(r\"Q:\\anaconda\\Lib\\site-packages\\en_core_web_lg\\en_core_web_lg-2.2.5\")"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "%%time\n",
49 | "\n",
50 | "def_str = r\"Q:\\\\tooBigToDrive\\data-mining\\kaggle\\data\\csv\"\n",
51 | "path = r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\data\\csv\"\n",
52 | "files = listdir(def_str)\n",
53 | "files = [f.replace(\".csv\",\"\") for f in files if \"Agg\" in f]\n",
54 | "\n",
55 | "for s in files:\n",
56 | " csvPath = def_str +\"\\\\\"+ s + \".csv\"\n",
57 | " npyPath = def_str +\"\\\\\"+ s +\"sSub\"+ \".npy\"\n",
58 | " train = pd.read_csv(csvPath)\n",
59 | " train.replace(to_replace = \"empty\", value = \"\", inplace = True)\n",
60 | " train[\"body\"].fillna(\"\",inplace = True)\n",
61 | " # enable this to add subreddits to body \n",
62 | " train[\"body\"] = train[\"subreddit\"]+\" \"+train[\"body\"]\n",
63 | " to_be_vectorized = train[\"body\"].tolist()\n",
64 | " vectorsl = []\n",
65 | " print(\"doing\"+\" \"+s+\".csv ...\", \"len(to_be_vectorized) = \",len(to_be_vectorized) )\n",
66 | " pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(to_be_vectorized)).start()\n",
67 | " i = 0\n",
68 | " # disable parser and ner pipes to have better performance\n",
69 | " with nlp.disable_pipes():\n",
70 | " for tex in to_be_vectorized:\n",
71 | " vectorsl.append(nlp(tex).vector)\n",
72 | " i += 1\n",
73 | " pbar.update(i)\n",
74 | " pbar.finish()\n",
75 | " vectors = np.array(vectorsl)\n",
76 | " np.save(npyPath,vectors)\n",
77 | " print(\"done\")\n"
78 | ]
79 | }
80 | ],
81 | "metadata": {
82 | "kernelspec": {
83 | "display_name": "Python [conda env:myEnv]",
84 | "language": "python",
85 | "name": "conda-env-myEnv-py"
86 | },
87 | "language_info": {
88 | "codemirror_mode": {
89 | "name": "ipython",
90 | "version": 3
91 | },
92 | "file_extension": ".py",
93 | "mimetype": "text/x-python",
94 | "name": "python",
95 | "nbconvert_exporter": "python",
96 | "pygments_lexer": "ipython3",
97 | "version": "3.7.6"
98 | }
99 | },
100 | "nbformat": 4,
101 | "nbformat_minor": 4
102 | }
103 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/.ipynb_checkpoints/ReadMe-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/.ipynb_checkpoints/final_bal_lr-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 26,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd \n",
10 | "from sklearn.metrics import roc_curve, auc\n",
11 | "from sklearn.model_selection import train_test_split\n",
12 | "from sklearn.model_selection import KFold\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import numpy as np\n",
15 | "from sklearn.linear_model import LogisticRegression\n",
16 | "import joblib"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 27,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "subs = pd.read_csv(r\"subs_bal.csv\")\n",
26 | "W2v= pd.read_csv(r\"W2v_bal.csv\")\n",
27 | "bodieswSdrop = pd.read_csv(r\"bodieswSdrop_bal.csv\")"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 28,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | " true_y bodieswSdrop_y subs_y W2v_y\n",
40 | "0 0 0.004841 0.354758 0.505174\n",
41 | "1 0 0.959119 0.556274 0.217193\n",
42 | "2 0 0.124737 0.321295 0.338535\n",
43 | "3 0 0.975953 0.398921 0.818394\n",
44 | "4 0 0.978466 0.354308 0.656013\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "df = pd.DataFrame({\"true_y\": bodieswSdrop[\"true_y\"].tolist(), \"bodieswSdrop_y\":bodieswSdrop[\"pred_y\"].tolist(), \"subs_y\": subs[\"pred_y\"].tolist(), \"W2v_y\": W2v[\"pred_y\"].tolist() })\n",
50 | "print(df.head(5))"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 29,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | " bodieswSdrop_y subs_y W2v_y\n",
63 | "0 0.004841 0.354758 0.505174\n",
64 | "1 0.959119 0.556274 0.217193\n",
65 | "2 0.124737 0.321295 0.338535\n",
66 | "3 0.975953 0.398921 0.818394\n",
67 | "4 0.978466 0.354308 0.656013\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "X = df.loc[:, [\"bodieswSdrop_y\", \"subs_y\", \"W2v_y\"]] # \"bodieswSdrop_y\", \"subs_y\", \"W2v_y\" #, \"subs_y\", \"W2v_y\"\n",
73 | "print(X.head(5))\n",
74 | "y = df.true_y"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 30,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "2842 2842 (2842, 3) (2842,)\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "X = X.to_numpy()\n",
92 | "y = y.to_numpy()\n",
93 | "print(len(X), len(y), X.shape, y.shape)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 31,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "done 1\n",
106 | "done 1\n",
107 | "done 1\n",
108 | "done 1\n",
109 | "done 1\n",
110 | "done 1\n",
111 | "done 1\n",
112 | "done 1\n",
113 | "done 1\n",
114 | "done 1\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "lrClf = LogisticRegression(C = 1) #modello\n",
120 | " \n",
121 | "kf = KFold(n_splits = 10, shuffle = True)\n",
122 | "\n",
123 | "for train_indices, test_indices in kf.split(X):\n",
124 | " lrClf.fit(X[train_indices], y[train_indices])\n",
125 | " print(\"done 1\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 32,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "['Q:\\\\tooBigToDrive\\\\data-mining\\\\kaggle\\\\my_models\\\\spaCy\\\\savedModels\\\\bal_lr\\\\bal_lr.sav']"
137 | ]
138 | },
139 | "execution_count": 32,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "joblib.dump(lrClf , r\"bal_lr\\bal_lr.sav\")"
146 | ]
147 | }
148 | ],
149 | "metadata": {
150 | "kernelspec": {
151 | "display_name": "Python [conda env:myEnv]",
152 | "language": "python",
153 | "name": "conda-env-myEnv-py"
154 | },
155 | "language_info": {
156 | "codemirror_mode": {
157 | "name": "ipython",
158 | "version": 3
159 | },
160 | "file_extension": ".py",
161 | "mimetype": "text/x-python",
162 | "name": "python",
163 | "nbconvert_exporter": "python",
164 | "pygments_lexer": "ipython3",
165 | "version": "3.7.6"
166 | }
167 | },
168 | "nbformat": 4,
169 | "nbformat_minor": 4
170 | }
171 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/.ipynb_checkpoints/final_lr-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "Using TensorFlow backend.\n",
13 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
14 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
15 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
16 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
17 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
18 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
19 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
20 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
21 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
22 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
23 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
24 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "import pandas as pd \n",
30 | "from sklearn.metrics import roc_curve, auc\n",
31 | "from sklearn.model_selection import train_test_split\n",
32 | "from sklearn.model_selection import KFold\n",
33 | "import matplotlib.pyplot as plt\n",
34 | "import numpy as np\n",
35 | "from sklearn.linear_model import LogisticRegression\n",
36 | "import joblib\n",
37 | "from imblearn.over_sampling import ADASYN "
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 6,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "bodies = pd.read_csv(r\"bodies.csv\")\n",
47 | "bodieswS = pd.read_csv(r\"bodieswS.csv\")\n",
48 | "subs = pd.read_csv(r\"subs.csv\")\n",
49 | "W2v= pd.read_csv(r\"W2v.csv\")\n",
50 | "W2vwS = pd.read_csv(r\"W2vwS.csv\")\n",
51 | "bodieswSdrop = pd.read_csv(r\"bodieswSdrop.csv\")"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 7,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | " true_y bodies_y bodieswS_y subs_y W2v_y W2vwS_y bodieswSdrop_y\n",
64 | "0 0 0.094856 0.120936 0.193810 0.093180 0.060750 0.046726\n",
65 | "1 1 0.106738 0.099757 0.478376 0.178994 0.149258 0.044623\n",
66 | "2 0 0.549541 0.253948 0.338182 0.892713 0.913806 0.222466\n",
67 | "3 1 0.425894 0.838085 0.374291 0.856181 0.820273 0.900915\n",
68 | "4 0 0.553865 0.341898 0.284349 0.461019 0.478457 0.525452\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "df = pd.DataFrame({\"true_y\": bodies[\"true_y\"].tolist(), \"bodies_y\":bodies[\"pred_y\"].tolist(), \"bodieswS_y\": bodieswS[\"pred_y\"].tolist(), \"subs_y\": subs[\"pred_y\"].tolist(), \"W2v_y\": W2v[\"pred_y\"].tolist(), \"W2vwS_y\": W2vwS[\"pred_y\"].tolist(), \"bodieswSdrop_y\":bodieswSdrop[\"pred_y\"].tolist() })\n",
74 | "print(df.head(5))"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 8,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | " bodieswSdrop_y subs_y W2v_y\n",
87 | "0 0.046726 0.193810 0.093180\n",
88 | "1 0.044623 0.478376 0.178994\n",
89 | "2 0.222466 0.338182 0.892713\n",
90 | "3 0.900915 0.374291 0.856181\n",
91 | "4 0.525452 0.284349 0.461019\n",
92 | "len(X) before adasyn: 1000 len(y_train) before adasyn: 1000 percentage before: 0.265\n",
93 | "len(X) after adasyn: 1467 len(y) after adasyn: 1467 percentage after: 0.49897750511247446\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "X = df.loc[:, [\"bodieswSdrop_y\", \"subs_y\", \"W2v_y\"]] #, \"subs_y\", \"W2v_y\"\n",
99 | "print(X.head(5))\n",
100 | "y = df.true_y\n",
101 | "\n",
102 | "sm = ADASYN()\n",
103 | "print(\"len(X) before adasyn: \",len(X), \"len(y_train) before adasyn:\", len(y), \"percentage before: \", sum(y.tolist())/len(y.tolist()))\n",
104 | "X, y = sm.fit_sample(X, y)\n",
105 | "print(\"len(X) after adasyn: \",len(X), \"len(y) after adasyn:\", len(y), \"percentage after: \", sum(y.tolist())/len(y.tolist()))\n",
106 | "#sum(y_validation.tolist())/len(y_validation.tolist())\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 9,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "1467 1467 (1467, 3) (1467,)\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "X = X.to_numpy()\n",
124 | "y = y.to_numpy()\n",
125 | "print(len(X), len(y), X.shape, y.shape)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 10,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "done 1\n",
138 | "done 1\n",
139 | "done 1\n",
140 | "done 1\n",
141 | "done 1\n",
142 | "done 1\n",
143 | "done 1\n",
144 | "done 1\n",
145 | "done 1\n",
146 | "done 1\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "lrClf = LogisticRegression(C = 1) #modello\n",
152 | " \n",
153 | "kf = KFold(n_splits = 10)\n",
154 | "\n",
155 | "for train_indices, test_indices in kf.split(X):\n",
156 | " lrClf.fit(X[train_indices], y[train_indices])\n",
157 | " print(\"done 1\")\n",
158 | "# print(svm.score(x_train[test_indices], y_train[test_indices]))\n",
159 | "# y_scoreSVM = svm.predict_proba(x_validation)[:,1]"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 11,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | "['Q:\\\\tooBigToDrive\\\\data-mining\\\\kaggle\\\\my_models\\\\spaCy\\\\savedModels\\\\lr_adasyn\\\\lr_adasyn.sav']"
171 | ]
172 | },
173 | "execution_count": 11,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "joblib.dump(lrClf , r\"lr_adasyn\\lr_adasyn.sav\")"
180 | ]
181 | }
182 | ],
183 | "metadata": {
184 | "kernelspec": {
185 | "display_name": "Python [conda env:myEnv]",
186 | "language": "python",
187 | "name": "conda-env-myEnv-py"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.7.6"
200 | }
201 | },
202 | "nbformat": 4,
203 | "nbformat_minor": 4
204 | }
205 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/.ipynb_checkpoints/final_svm-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd \n",
10 | "from sklearn.metrics import roc_curve, auc\n",
11 | "from sklearn.model_selection import train_test_split\n",
12 | "from sklearn.model_selection import KFold\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import numpy as np\n",
15 | "from sklearn import svm\n",
16 | "import joblib"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "bodies = pd.read_csv(r\"bodies.csv\")\n",
26 | "bodieswS = pd.read_csv(r\"bodieswS.csv\")\n",
27 | "subs = pd.read_csv(r\"subs.csv\")\n",
28 | "W2v= pd.read_csv(r\"W2v.csv\")\n",
29 | "W2vwS = pd.read_csv(r\"W2vwS.csv\")\n",
30 | "bodieswSdrop = pd.read_csv(r\"bodieswSdrop.csv\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | " true_y bodies_y bodieswS_y subs_y W2v_y W2vwS_y bodieswSdrop_y\n",
43 | "0 0 0.094856 0.120936 0.193810 0.093180 0.060750 0.046726\n",
44 | "1 1 0.106738 0.099757 0.478376 0.178994 0.149258 0.044623\n",
45 | "2 0 0.549541 0.253948 0.338182 0.892713 0.913806 0.222466\n",
46 | "3 1 0.425894 0.838085 0.374291 0.856181 0.820273 0.900915\n",
47 | "4 0 0.553865 0.341898 0.284349 0.461019 0.478457 0.525452\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "df = pd.DataFrame({\"true_y\": bodies[\"true_y\"].tolist(), \"bodies_y\":bodies[\"pred_y\"].tolist(), \"bodieswS_y\": bodieswS[\"pred_y\"].tolist(), \"subs_y\": subs[\"pred_y\"].tolist(), \"W2v_y\": W2v[\"pred_y\"].tolist(), \"W2vwS_y\": W2vwS[\"pred_y\"].tolist(), \"bodieswSdrop_y\":bodieswSdrop[\"pred_y\"].tolist() })\n",
53 | "print(df.head(5))"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 4,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | " bodieswSdrop_y subs_y W2v_y\n",
66 | "0 0.046726 0.193810 0.093180\n",
67 | "1 0.044623 0.478376 0.178994\n",
68 | "2 0.222466 0.338182 0.892713\n",
69 | "3 0.900915 0.374291 0.856181\n",
70 | "4 0.525452 0.284349 0.461019\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "X = df.loc[:, [\"bodieswSdrop_y\", \"subs_y\", \"W2v_y\"]] #, \"subs_y\", \"W2v_y\"\n",
76 | "print(X.head(5))\n",
77 | "y = df.true_y"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "1000 1000 (1000, 3) (1000,)\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "X = X.to_numpy()\n",
95 | "y = y.to_numpy()\n",
96 | "print(len(X), len(y), X.shape, y.shape)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stdout",
106 | "output_type": "stream",
107 | "text": [
108 | "done 1\n",
109 | "done 1\n",
110 | "done 1\n",
111 | "done 1\n",
112 | "done 1\n",
113 | "done 1\n",
114 | "done 1\n",
115 | "done 1\n",
116 | "done 1\n",
117 | "done 1\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "svm = svm.SVC(C=1.0, kernel='poly', degree=2, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,\n",
123 | " decision_function_shape='ovr', break_ties=False, random_state=None)\n",
124 | " \n",
125 | "kf = KFold(n_splits = 10)\n",
126 | "\n",
127 | "for train_indices, test_indices in kf.split(X):\n",
128 | " svm.fit(X[train_indices], y[train_indices])\n",
129 | " print(\"done 1\")\n",
130 | "# print(svm.score(x_train[test_indices], y_train[test_indices]))\n",
131 | "# y_scoreSVM = svm.predict_proba(x_validation)[:,1]"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 8,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "data": {
141 | "text/plain": [
142 | "['Q:\\\\tooBigToDrive\\\\data-mining\\\\kaggle\\\\my_models\\\\spaCy\\\\savedModels\\\\svm\\\\svm.sav']"
143 | ]
144 | },
145 | "execution_count": 8,
146 | "metadata": {},
147 | "output_type": "execute_result"
148 | }
149 | ],
150 | "source": [
151 | "joblib.dump(svm , r\"svm\\svm.sav\")"
152 | ]
153 | }
154 | ],
155 | "metadata": {
156 | "kernelspec": {
157 | "display_name": "Python [conda env:myEnv]",
158 | "language": "python",
159 | "name": "conda-env-myEnv-py"
160 | },
161 | "language_info": {
162 | "codemirror_mode": {
163 | "name": "ipython",
164 | "version": 3
165 | },
166 | "file_extension": ".py",
167 | "mimetype": "text/x-python",
168 | "name": "python",
169 | "nbconvert_exporter": "python",
170 | "pygments_lexer": "ipython3",
171 | "version": "3.7.6"
172 | }
173 | },
174 | "nbformat": 4,
175 | "nbformat_minor": 4
176 | }
177 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/.ipynb_checkpoints/solution-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import joblib"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "subs = pd.read_csv(r\"subs_test_predictions.csv\")\n",
20 | "bodieswS = pd.read_csv(r\"bodieswS_test_predictions.csv\")\n",
21 | "W2v = pd.read_csv(r\"spacyW2v_test_predictions.csv\")"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#svm = joblib.load(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\savedModels\\svm\\svm.sav\")\n",
31 | "lr = joblib.load(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\savedModels\\lr_adasyn\\lr_adasyn.sav\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "df = pd.DataFrame({\"subs\": subs[\"pred_y\"].tolist(), \"bodieswS\": bodieswS[\"pred_y\"].tolist(), \"W2v\": W2v[\"pred_y\"].tolist()})"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 5,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "[[0.27114913 0.01077099 0.04565962]\n",
53 | " [0.32025427 0.95815259 0.584287 ]\n",
54 | " [0.11948037 0.0448686 0.23606443]\n",
55 | " [0.27441698 0.46897009 0.28487484]\n",
56 | " [0.1256479 0.05315585 0.78758538]]\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "X = df.to_numpy()\n",
62 | "print(X[0:5])"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 6,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# sols = svm.predict_proba(X)[:,1]\n",
72 | "# print(sols[:5])"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 7,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "[0.14656961 0.97952294 0.17106152 0.71290015 0.28597205]\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "sols = lr.predict_proba(X)[:,1]\n",
90 | "print(sols[:5])"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": []
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 8,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | " author gender\n",
110 | "0 --redbeard-- 0.146570\n",
111 | "1 -Allaina- 0.979523\n",
112 | "2 -AllonsyAlonso 0.171062\n",
113 | "3 -Beth- 0.712900\n",
114 | "4 -Greeny- 0.285972\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "solution = pd.DataFrame({\"author\": subs[\"author\"].tolist(), \"gender\":sols})\n",
120 | "print(solution.head())"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 9,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "solution.to_csv(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\results\\finals\\csv\\test\\lrSolution_adasyn.csv\", index = False)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 10,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# sols1 = [1 if s >= 0.5 else 0 for s in sols]\n",
139 | "# print(sols1[:5])"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 11,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# solution1 = pd.DataFrame({\"author\": subs[\"author\"].tolist(), \"gender\": sols1})\n",
149 | "# solution1.to_csv(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\results\\finals\\csv\\test\\lrSolution_adasyn.csv\", index = False)"
150 | ]
151 | }
152 | ],
153 | "metadata": {
154 | "kernelspec": {
155 | "display_name": "Python [conda env:myEnv]",
156 | "language": "python",
157 | "name": "conda-env-myEnv-py"
158 | },
159 | "language_info": {
160 | "codemirror_mode": {
161 | "name": "ipython",
162 | "version": 3
163 | },
164 | "file_extension": ".py",
165 | "mimetype": "text/x-python",
166 | "name": "python",
167 | "nbconvert_exporter": "python",
168 | "pygments_lexer": "ipython3",
169 | "version": "3.7.6"
170 | }
171 | },
172 | "nbformat": 4,
173 | "nbformat_minor": 4
174 | }
175 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/.ipynb_checkpoints/solution_bal-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import joblib"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 4,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "subs = pd.read_csv(r\"subs_bal_test_predictions.csv\")\n",
20 | "bodieswS = pd.read_csv(r\"bodieswS_bal_test_predictions.csv\")\n",
21 | "W2v = pd.read_csv(r\"spacyW2v_bal_test_predictions.csv\")"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 5,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "lr = joblib.load(r\"Qbal_lr\\bal_lr.sav\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 6,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "df = pd.DataFrame({\"subs\": subs[\"pred_y\"].tolist(), \"bodieswS\": bodieswS[\"pred_y\"].tolist(), \"W2v\": W2v[\"pred_y\"].tolist()})"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 7,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "[[0.48052195 0.09621675 0.53980164]\n",
52 | " [0.51025856 0.98378307 0.56349511]\n",
53 | " [0.26692441 0.34320322 0.47294487]\n",
54 | " [0.40504095 0.37116724 0.69072162]\n",
55 | " [0.43180564 0.83457643 0.59301484]]\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "X = df.to_numpy()\n",
61 | "print(X[0:5])"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 8,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "[0.01094408 0.62939482 0.030943 0.06394275 0.42612817]\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "sols = lr.predict_proba(X)[:,1]\n",
79 | "print(sols[:5])"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 9,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | " author gender\n",
92 | "0 --redbeard-- 0.010944\n",
93 | "1 -Allaina- 0.629395\n",
94 | "2 -AllonsyAlonso 0.030943\n",
95 | "3 -Beth- 0.063943\n",
96 | "4 -Greeny- 0.426128\n"
97 | ]
98 | }
99 | ],
100 | "source": [
101 | "solution = pd.DataFrame({\"author\": subs[\"author\"].tolist(), \"gender\":sols})\n",
102 | "print(solution.head())"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 10,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "solution.to_csv(r\"bal_lrSolution.csv\", index = False)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": []
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python [conda env:myEnv]",
125 | "language": "python",
126 | "name": "conda-env-myEnv-py"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 3
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython3",
138 | "version": "3.7.6"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 4
143 | }
144 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Folder explanation
2 |
3 | These models are the same as the ones in the intermediate_models folder. Here they are trained over all 5000 points and used to predict test_data.csv. These models are also saved, and some of them output the prediction distribution as an image.
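4 | 
5 | A minimal sketch of this train-on-everything-then-save pattern is below (array and file names other than `targets.npy` are illustrative, not the exact ones used by the notebooks):
6 | 
7 | ```python
8 | import joblib
9 | import numpy as np
10 | from sklearn.linear_model import LogisticRegression
11 | 
12 | # Stand-ins for the full 5000-point training features and the test features
13 | # produced by the preprocessing notebooks (illustrative file names).
14 | X_train = np.load("train_features.npy")   # shape (5000, n_features)
15 | y_train = np.load("targets.npy")          # 0/1 gender labels
16 | X_test = np.load("test_features.npy")
17 | 
18 | # Fit on all points: no hold-out split at this stage.
19 | clf = LogisticRegression(C=1).fit(X_train, y_train)
20 | test_pred = clf.predict_proba(X_test)[:, 1]   # probability of class 1
21 | 
22 | # Save the fitted model; the solution notebooks reload it with joblib.load.
23 | joblib.dump(clf, "model.sav")
24 | ```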
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/final_bal_lr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 26,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd \n",
10 | "from sklearn.metrics import roc_curve, auc\n",
11 | "from sklearn.model_selection import train_test_split\n",
12 | "from sklearn.model_selection import KFold\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import numpy as np\n",
15 | "from sklearn.linear_model import LogisticRegression\n",
16 | "import joblib"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 27,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "subs = pd.read_csv(r\"subs_bal.csv\")\n",
26 | "W2v= pd.read_csv(r\"W2v_bal.csv\")\n",
27 | "bodieswSdrop = pd.read_csv(r\"bodieswSdrop_bal.csv\")"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 28,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | " true_y bodieswSdrop_y subs_y W2v_y\n",
40 | "0 0 0.004841 0.354758 0.505174\n",
41 | "1 0 0.959119 0.556274 0.217193\n",
42 | "2 0 0.124737 0.321295 0.338535\n",
43 | "3 0 0.975953 0.398921 0.818394\n",
44 | "4 0 0.978466 0.354308 0.656013\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "df = pd.DataFrame({\"true_y\": bodieswSdrop[\"true_y\"].tolist(), \"bodieswSdrop_y\":bodieswSdrop[\"pred_y\"].tolist(), \"subs_y\": subs[\"pred_y\"].tolist(), \"W2v_y\": W2v[\"pred_y\"].tolist() })\n",
50 | "print(df.head(5))"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 29,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | " bodieswSdrop_y subs_y W2v_y\n",
63 | "0 0.004841 0.354758 0.505174\n",
64 | "1 0.959119 0.556274 0.217193\n",
65 | "2 0.124737 0.321295 0.338535\n",
66 | "3 0.975953 0.398921 0.818394\n",
67 | "4 0.978466 0.354308 0.656013\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "X = df.loc[:, [\"bodieswSdrop_y\", \"subs_y\", \"W2v_y\"]] # \"bodieswSdrop_y\", \"subs_y\", \"W2v_y\" #, \"subs_y\", \"W2v_y\"\n",
73 | "print(X.head(5))\n",
74 | "y = df.true_y"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 30,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "2842 2842 (2842, 3) (2842,)\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "X = X.to_numpy()\n",
92 | "y = y.to_numpy()\n",
93 | "print(len(X), len(y), X.shape, y.shape)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 31,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "done 1\n",
106 | "done 1\n",
107 | "done 1\n",
108 | "done 1\n",
109 | "done 1\n",
110 | "done 1\n",
111 | "done 1\n",
112 | "done 1\n",
113 | "done 1\n",
114 | "done 1\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "lrClf = LogisticRegression(C = 1) #modello\n",
120 | " \n",
121 | "kf = KFold(n_splits = 10, shuffle = True)\n",
122 | "\n",
123 | "for train_indices, test_indices in kf.split(X):\n",
124 | " lrClf.fit(X[train_indices], y[train_indices])\n",
125 | " print(\"done 1\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 32,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "['Q:\\\\tooBigToDrive\\\\data-mining\\\\kaggle\\\\my_models\\\\spaCy\\\\savedModels\\\\bal_lr\\\\bal_lr.sav']"
137 | ]
138 | },
139 | "execution_count": 32,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "joblib.dump(lrClf , r\"bal_lr\\bal_lr.sav\")"
146 | ]
147 | }
148 | ],
149 | "metadata": {
150 | "kernelspec": {
151 | "display_name": "Python [conda env:myEnv]",
152 | "language": "python",
153 | "name": "conda-env-myEnv-py"
154 | },
155 | "language_info": {
156 | "codemirror_mode": {
157 | "name": "ipython",
158 | "version": 3
159 | },
160 | "file_extension": ".py",
161 | "mimetype": "text/x-python",
162 | "name": "python",
163 | "nbconvert_exporter": "python",
164 | "pygments_lexer": "ipython3",
165 | "version": "3.7.6"
166 | }
167 | },
168 | "nbformat": 4,
169 | "nbformat_minor": 4
170 | }
171 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/final_lr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "Using TensorFlow backend.\n",
13 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
14 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
15 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
16 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
17 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
18 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
19 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
20 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
21 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
22 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
23 | "Q:\\anaconda\\envs\\myEnv\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
24 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "import pandas as pd \n",
30 | "from sklearn.metrics import roc_curve, auc\n",
31 | "from sklearn.model_selection import train_test_split\n",
32 | "from sklearn.model_selection import KFold\n",
33 | "import matplotlib.pyplot as plt\n",
34 | "import numpy as np\n",
35 | "from sklearn.linear_model import LogisticRegression\n",
36 | "import joblib\n",
37 | "from imblearn.over_sampling import ADASYN "
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 6,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "bodies = pd.read_csv(r\"bodies.csv\")\n",
47 | "bodieswS = pd.read_csv(r\"bodieswS.csv\")\n",
48 | "subs = pd.read_csv(r\"subs.csv\")\n",
49 | "W2v= pd.read_csv(r\"W2v.csv\")\n",
50 | "W2vwS = pd.read_csv(r\"W2vwS.csv\")\n",
51 | "bodieswSdrop = pd.read_csv(r\"bodieswSdrop.csv\")"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 7,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | " true_y bodies_y bodieswS_y subs_y W2v_y W2vwS_y bodieswSdrop_y\n",
64 | "0 0 0.094856 0.120936 0.193810 0.093180 0.060750 0.046726\n",
65 | "1 1 0.106738 0.099757 0.478376 0.178994 0.149258 0.044623\n",
66 | "2 0 0.549541 0.253948 0.338182 0.892713 0.913806 0.222466\n",
67 | "3 1 0.425894 0.838085 0.374291 0.856181 0.820273 0.900915\n",
68 | "4 0 0.553865 0.341898 0.284349 0.461019 0.478457 0.525452\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "df = pd.DataFrame({\"true_y\": bodies[\"true_y\"].tolist(), \"bodies_y\":bodies[\"pred_y\"].tolist(), \"bodieswS_y\": bodieswS[\"pred_y\"].tolist(), \"subs_y\": subs[\"pred_y\"].tolist(), \"W2v_y\": W2v[\"pred_y\"].tolist(), \"W2vwS_y\": W2vwS[\"pred_y\"].tolist(), \"bodieswSdrop_y\":bodieswSdrop[\"pred_y\"].tolist() })\n",
74 | "print(df.head(5))"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 8,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | " bodieswSdrop_y subs_y W2v_y\n",
87 | "0 0.046726 0.193810 0.093180\n",
88 | "1 0.044623 0.478376 0.178994\n",
89 | "2 0.222466 0.338182 0.892713\n",
90 | "3 0.900915 0.374291 0.856181\n",
91 | "4 0.525452 0.284349 0.461019\n",
92 | "len(X) before adasyn: 1000 len(y_train) before adasyn: 1000 percentage before: 0.265\n",
93 | "len(X) after adasyn: 1467 len(y) after adasyn: 1467 percentage after: 0.49897750511247446\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "X = df.loc[:, [\"bodieswSdrop_y\", \"subs_y\", \"W2v_y\"]] #, \"subs_y\", \"W2v_y\"\n",
99 | "print(X.head(5))\n",
100 | "y = df.true_y\n",
101 | "\n",
102 | "sm = ADASYN()\n",
103 | "print(\"len(X) before adasyn: \",len(X), \"len(y_train) before adasyn:\", len(y), \"percentage before: \", sum(y.tolist())/len(y.tolist()))\n",
104 | "X, y = sm.fit_sample(X, y)\n",
105 | "print(\"len(X) after adasyn: \",len(X), \"len(y) after adasyn:\", len(y), \"percentage after: \", sum(y.tolist())/len(y.tolist()))\n",
106 | "#sum(y_validation.tolist())/len(y_validation.tolist())\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 9,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "1467 1467 (1467, 3) (1467,)\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "X = X.to_numpy()\n",
124 | "y = y.to_numpy()\n",
125 | "print(len(X), len(y), X.shape, y.shape)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 10,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "done 1\n",
138 | "done 1\n",
139 | "done 1\n",
140 | "done 1\n",
141 | "done 1\n",
142 | "done 1\n",
143 | "done 1\n",
144 | "done 1\n",
145 | "done 1\n",
146 | "done 1\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "lrClf = LogisticRegression(C = 1) #modello\n",
152 | " \n",
153 | "kf = KFold(n_splits = 10)\n",
154 | "\n",
155 | "for train_indices, test_indices in kf.split(X):\n",
156 | " lrClf.fit(X[train_indices], y[train_indices])\n",
157 | " print(\"done 1\")\n",
158 | "# print(svm.score(x_train[test_indices], y_train[test_indices]))\n",
159 | "# y_scoreSVM = svm.predict_proba(x_validation)[:,1]"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 11,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | "['Q:\\\\tooBigToDrive\\\\data-mining\\\\kaggle\\\\my_models\\\\spaCy\\\\savedModels\\\\lr_adasyn\\\\lr_adasyn.sav']"
171 | ]
172 | },
173 | "execution_count": 11,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "joblib.dump(lrClf , r\"lr_adasyn\\lr_adasyn.sav\")"
180 | ]
181 | }
182 | ],
183 | "metadata": {
184 | "kernelspec": {
185 | "display_name": "Python [conda env:myEnv]",
186 | "language": "python",
187 | "name": "conda-env-myEnv-py"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.7.6"
200 | }
201 | },
202 | "nbformat": 4,
203 | "nbformat_minor": 4
204 | }
205 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/final_svm.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd \n",
10 | "from sklearn.metrics import roc_curve, auc\n",
11 | "from sklearn.model_selection import train_test_split\n",
12 | "from sklearn.model_selection import KFold\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import numpy as np\n",
15 | "from sklearn import svm\n",
16 | "import joblib"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "bodies = pd.read_csv(r\"bodies.csv\")\n",
26 | "bodieswS = pd.read_csv(r\"bodieswS.csv\")\n",
27 | "subs = pd.read_csv(r\"subs.csv\")\n",
28 | "W2v= pd.read_csv(r\"W2v.csv\")\n",
29 | "W2vwS = pd.read_csv(r\"W2vwS.csv\")\n",
30 | "bodieswSdrop = pd.read_csv(r\"bodieswSdrop.csv\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | " true_y bodies_y bodieswS_y subs_y W2v_y W2vwS_y bodieswSdrop_y\n",
43 | "0 0 0.094856 0.120936 0.193810 0.093180 0.060750 0.046726\n",
44 | "1 1 0.106738 0.099757 0.478376 0.178994 0.149258 0.044623\n",
45 | "2 0 0.549541 0.253948 0.338182 0.892713 0.913806 0.222466\n",
46 | "3 1 0.425894 0.838085 0.374291 0.856181 0.820273 0.900915\n",
47 | "4 0 0.553865 0.341898 0.284349 0.461019 0.478457 0.525452\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "df = pd.DataFrame({\"true_y\": bodies[\"true_y\"].tolist(), \"bodies_y\":bodies[\"pred_y\"].tolist(), \"bodieswS_y\": bodieswS[\"pred_y\"].tolist(), \"subs_y\": subs[\"pred_y\"].tolist(), \"W2v_y\": W2v[\"pred_y\"].tolist(), \"W2vwS_y\": W2vwS[\"pred_y\"].tolist(), \"bodieswSdrop_y\":bodieswSdrop[\"pred_y\"].tolist() })\n",
53 | "print(df.head(5))"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 4,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | " bodieswSdrop_y subs_y W2v_y\n",
66 | "0 0.046726 0.193810 0.093180\n",
67 | "1 0.044623 0.478376 0.178994\n",
68 | "2 0.222466 0.338182 0.892713\n",
69 | "3 0.900915 0.374291 0.856181\n",
70 | "4 0.525452 0.284349 0.461019\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "X = df.loc[:, [\"bodieswSdrop_y\", \"subs_y\", \"W2v_y\"]] #, \"subs_y\", \"W2v_y\"\n",
76 | "print(X.head(5))\n",
77 | "y = df.true_y"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "1000 1000 (1000, 3) (1000,)\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "X = X.to_numpy()\n",
95 | "y = y.to_numpy()\n",
96 | "print(len(X), len(y), X.shape, y.shape)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stdout",
106 | "output_type": "stream",
107 | "text": [
108 | "done 1\n",
109 | "done 1\n",
110 | "done 1\n",
111 | "done 1\n",
112 | "done 1\n",
113 | "done 1\n",
114 | "done 1\n",
115 | "done 1\n",
116 | "done 1\n",
117 | "done 1\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "svm = svm.SVC(C=1.0, kernel='poly', degree=2, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,\n",
123 | " decision_function_shape='ovr', break_ties=False, random_state=None)\n",
124 | " \n",
125 | "kf = KFold(n_splits = 10)\n",
126 | "\n",
127 | "for train_indices, test_indices in kf.split(X):\n",
128 | " svm.fit(X[train_indices], y[train_indices])\n",
129 | " print(\"done 1\")\n",
130 | "# print(svm.score(x_train[test_indices], y_train[test_indices]))\n",
131 | "# y_scoreSVM = svm.predict_proba(x_validation)[:,1]"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 8,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "data": {
141 | "text/plain": [
142 | "['Q:\\\\tooBigToDrive\\\\data-mining\\\\kaggle\\\\my_models\\\\spaCy\\\\savedModels\\\\svm\\\\svm.sav']"
143 | ]
144 | },
145 | "execution_count": 8,
146 | "metadata": {},
147 | "output_type": "execute_result"
148 | }
149 | ],
150 | "source": [
151 | "joblib.dump(svm , r\"svm\\svm.sav\")"
152 | ]
153 | }
154 | ],
155 | "metadata": {
156 | "kernelspec": {
157 | "display_name": "Python [conda env:myEnv]",
158 | "language": "python",
159 | "name": "conda-env-myEnv-py"
160 | },
161 | "language_info": {
162 | "codemirror_mode": {
163 | "name": "ipython",
164 | "version": 3
165 | },
166 | "file_extension": ".py",
167 | "mimetype": "text/x-python",
168 | "name": "python",
169 | "nbconvert_exporter": "python",
170 | "pygments_lexer": "ipython3",
171 | "version": "3.7.6"
172 | }
173 | },
174 | "nbformat": 4,
175 | "nbformat_minor": 4
176 | }
177 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/solution.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import joblib"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "subs = pd.read_csv(r\"subs_test_predictions.csv\")\n",
20 | "bodieswS = pd.read_csv(r\"bodieswS_test_predictions.csv\")\n",
21 | "W2v = pd.read_csv(r\"spacyW2v_test_predictions.csv\")"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#svm = joblib.load(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\savedModels\\svm\\svm.sav\")\n",
31 | "lr = joblib.load(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\savedModels\\lr_adasyn\\lr_adasyn.sav\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "df = pd.DataFrame({\"subs\": subs[\"pred_y\"].tolist(), \"bodieswS\": bodieswS[\"pred_y\"].tolist(), \"W2v\": W2v[\"pred_y\"].tolist()})"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 5,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "[[0.27114913 0.01077099 0.04565962]\n",
53 | " [0.32025427 0.95815259 0.584287 ]\n",
54 | " [0.11948037 0.0448686 0.23606443]\n",
55 | " [0.27441698 0.46897009 0.28487484]\n",
56 | " [0.1256479 0.05315585 0.78758538]]\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "X = df.to_numpy()\n",
62 | "print(X[0:5])"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 6,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# sols = svm.predict_proba(X)[:,1]\n",
72 | "# print(sols[:5])"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 7,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "[0.14656961 0.97952294 0.17106152 0.71290015 0.28597205]\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "sols = lr.predict_proba(X)[:,1]\n",
90 | "print(sols[:5])"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": []
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 8,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | " author gender\n",
110 | "0 --redbeard-- 0.146570\n",
111 | "1 -Allaina- 0.979523\n",
112 | "2 -AllonsyAlonso 0.171062\n",
113 | "3 -Beth- 0.712900\n",
114 | "4 -Greeny- 0.285972\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "solution = pd.DataFrame({\"author\": subs[\"author\"].tolist(), \"gender\":sols})\n",
120 | "print(solution.head())"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 9,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "solution.to_csv(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\results\\finals\\csv\\test\\lrSolution_adasyn.csv\", index = False)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 10,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# sols1 = [1 if s >= 0.5 else 0 for s in sols]\n",
139 | "# print(sols1[:5])"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 11,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# solution1 = pd.DataFrame({\"author\": subs[\"author\"].tolist(), \"gender\": sols1})\n",
149 | "# solution1.to_csv(r\"Q:\\tooBigToDrive\\data-mining\\kaggle\\my_models\\spaCy\\results\\finals\\csv\\test\\lrSolution_adasyn.csv\", index = False)"
150 | ]
151 | }
152 | ],
153 | "metadata": {
154 | "kernelspec": {
155 | "display_name": "Python [conda env:myEnv]",
156 | "language": "python",
157 | "name": "conda-env-myEnv-py"
158 | },
159 | "language_info": {
160 | "codemirror_mode": {
161 | "name": "ipython",
162 | "version": 3
163 | },
164 | "file_extension": ".py",
165 | "mimetype": "text/x-python",
166 | "name": "python",
167 | "nbconvert_exporter": "python",
168 | "pygments_lexer": "ipython3",
169 | "version": "3.7.6"
170 | }
171 | },
172 | "nbformat": 4,
173 | "nbformat_minor": 4
174 | }
175 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/finals/solution_bal.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import joblib"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 4,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "subs = pd.read_csv(r\"subs_bal_test_predictions.csv\")\n",
20 | "bodieswS = pd.read_csv(r\"bodieswS_bal_test_predictions.csv\")\n",
21 | "W2v = pd.read_csv(r\"spacyW2v_bal_test_predictions.csv\")"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 5,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "lr = joblib.load(r\"Qbal_lr\\bal_lr.sav\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 6,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "df = pd.DataFrame({\"subs\": subs[\"pred_y\"].tolist(), \"bodieswS\": bodieswS[\"pred_y\"].tolist(), \"W2v\": W2v[\"pred_y\"].tolist()})"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 7,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "[[0.48052195 0.09621675 0.53980164]\n",
52 | " [0.51025856 0.98378307 0.56349511]\n",
53 | " [0.26692441 0.34320322 0.47294487]\n",
54 | " [0.40504095 0.37116724 0.69072162]\n",
55 | " [0.43180564 0.83457643 0.59301484]]\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "X = df.to_numpy()\n",
61 | "print(X[0:5])"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 8,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "[0.01094408 0.62939482 0.030943 0.06394275 0.42612817]\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "sols = lr.predict_proba(X)[:,1]\n",
79 | "print(sols[:5])"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 9,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | " author gender\n",
92 | "0 --redbeard-- 0.010944\n",
93 | "1 -Allaina- 0.629395\n",
94 | "2 -AllonsyAlonso 0.030943\n",
95 | "3 -Beth- 0.063943\n",
96 | "4 -Greeny- 0.426128\n"
97 | ]
98 | }
99 | ],
100 | "source": [
101 | "solution = pd.DataFrame({\"author\": subs[\"author\"].tolist(), \"gender\":sols})\n",
102 | "print(solution.head())"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 10,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "solution.to_csv(r\"bal_lrSolution.csv\", index = False)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": []
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python [conda env:myEnv]",
125 | "language": "python",
126 | "name": "conda-env-myEnv-py"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 3
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython3",
138 | "version": "3.7.6"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 4
143 | }
144 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/images/bodieswS_test_ensemble_balanced_e15_wS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/other-attempts/spaCy/images/bodieswS_test_ensemble_balanced_e15_wS.png
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/images/bodieswS_test_ensemble_balanced_e3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/other-attempts/spaCy/images/bodieswS_test_ensemble_balanced_e3.png
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/intermediate_models/.ipynb_checkpoints/ReadMe-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Description\n",
8 | "\n",
9 | "Here there are the implementations of the models used to obtain the 1000 prediction on which we trained the logistic regeression"
10 | ]
11 | }
12 | ],
13 | "metadata": {
14 | "kernelspec": {
15 | "display_name": "Python [conda env:myEnv]",
16 | "language": "python",
17 | "name": "conda-env-myEnv-py"
18 | },
19 | "language_info": {
20 | "codemirror_mode": {
21 | "name": "ipython",
22 | "version": 3
23 | },
24 | "file_extension": ".py",
25 | "mimetype": "text/x-python",
26 | "name": "python",
27 | "nbconvert_exporter": "python",
28 | "pygments_lexer": "ipython3",
29 | "version": "3.7.6"
30 | }
31 | },
32 | "nbformat": 4,
33 | "nbformat_minor": 4
34 | }
35 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/intermediate_models/.ipynb_checkpoints/spactW2v-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# W2v\n",
8 | "\n",
9 | "A classification of W2v spaCy vectors, using scikit MLP"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
17 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
18 | },
19 | "outputs": [],
20 | "source": [
21 | "from sklearn.preprocessing import StandardScaler # For scaling\n",
22 | "from sklearn.model_selection import train_test_split # for creating valid set and train set \n",
23 | "from sklearn.neural_network import MLPClassifier\n",
24 | "from sklearn.model_selection import KFold\n",
25 | "from sklearn.svm import SVC, LinearSVC\n",
26 | "import numpy as np\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "from sklearn.metrics import roc_curve, auc\n",
29 | "import os\n",
30 | "from os import listdir\n",
31 | "from os.path import isfile, join\n",
32 | "from sklearn.decomposition import PCA\n",
33 | "import pandas as pd\n",
34 | "import math\n",
35 | "from sklearn.utils import shuffle"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
43 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
44 | },
45 | "outputs": [],
46 | "source": [
47 | "y = np.load(\"targets.npy\").tolist()\n",
48 | "files = listdir(\"npy5000/\")\n",
49 | "files = [f for f in files if f == \"lPunctNumStopLemOovAgg.npy\"]\n",
50 | "for f in files:\n",
51 | " for i in range(1):\n",
52 | " X = np.load(\"../input/mydata/npy5000/\"+f)\n",
53 | " i = 75\n",
54 | " pca = PCA(i)\n",
55 | " pca.fit(X)\n",
56 | " U = pca.transform(X)\n",
57 | " U = U.tolist()\n",
58 | " df = pd.DataFrame({\"vect\": U, \"gender\": y})\n",
59 | " # unbalnced \n",
60 | " seed = 100\n",
61 | " split = math.floor(len(df)*0.8)\n",
62 | " train_df = df.sample(split, random_state = 100)\n",
63 | " test_df = df.drop(train_df.index)\n",
64 | " x_train = np.array(train_df[\"vect\"].tolist())\n",
65 | " print(\"x_train.shape = \", x_train.shape)\n",
66 | " #print(\"x_train[0] = \", x_train[0])\n",
67 | " x_validation = np.array(test_df[\"vect\"].tolist())\n",
68 | " print(\"x_validation.shape = \", x_validation.shape)\n",
69 | " y_train = np.array(train_df[\"gender\"].tolist())\n",
70 | " print(\"y_train.shape = \", y_train.shape)\n",
71 | " y_validation = np.array(test_df[\"gender\"].tolist())\n",
72 | " print(\"y_validation.shape = \", y_validation.shape) \n",
73 | "\n",
74 | "# end of unbalanced\n",
75 | " \n",
76 | " # balanced part\n",
77 | "# U_m = df.loc[df[\"gender\"] == 0, :]\n",
78 | "# U_f = df.loc[df[\"gender\"] == 1, :]\n",
79 | "\n",
80 | "# split = math.floor(len(U_f)*0.8)\n",
81 | "# print(\"split = \",split)\n",
82 | "\n",
83 | "# seed = 100\n",
84 | "\n",
85 | "# train_data_sample_m = U_m.sample(n = split, random_state = seed)\n",
86 | "# train_vects_m =train_data_sample_m[\"vect\"].tolist()\n",
87 | "# test_data_sample_m = U_m.drop(train_data_sample_m.index)\n",
88 | "# #test_data_sample_m = test_data_sample_m.reset_index() \n",
89 | "# test_vects_m = test_data_sample_m[\"vect\"].tolist()\n",
90 | "\n",
91 | "# train_data_sample_f = U_f.sample(n = split, random_state = seed)\n",
92 | "# train_vects_f = train_data_sample_f[\"vect\"].tolist()\n",
93 | "# test_data_sample_f = U_f.drop(train_data_sample_f.index)\n",
94 | "# #test_data_sample_f = test_data_sample_f.reset_index() \n",
95 | "# test_vects_f = test_data_sample_f[\"vect\"].tolist()\n",
96 | "\n",
97 | "# train_vects = train_vects_m + train_vects_f\n",
98 | "# test_vects = test_vects_m + test_vects_f\n",
99 | "\n",
100 | "# train_labels = [0 for i in range(split)] + [1 for i in range(split)]\n",
101 | "# test_labels = [0 for i in range(len(U_m)-split)] + [1 for i in range(len(U_f)-split)]\n",
102 | "# x_train = np.array(train_vects)\n",
103 | "# print(\"x_train.shape = \", x_train.shape)\n",
104 | "# #print(\"x_train[0] = \", x_train[0])\n",
105 | "# x_validation = np.array(test_vects)\n",
106 | "# print(\"x_validation.shape = \", x_validation.shape)\n",
107 | "# y_train = np.array(train_labels)\n",
108 | "# print(\"y_train.shape = \", y_train.shape)\n",
109 | "# y_validation = np.array(test_labels)\n",
110 | "# print(\"y_validation.shape = \", y_validation.shape)\n",
111 | "# x_train, y_train = shuffle(x_train, y_train, random_state = 0)\n",
112 | " # end of balanced\n",
113 | " \n",
114 | " \n",
115 | " # model\n",
116 | " mlpClf = MLPClassifier(solver = 'adam', activation= 'relu' ,alpha = 0.02, verbose = False, early_stopping = True,\n",
117 | " learning_rate = 'invscaling', max_iter = 400)\n",
118 | "\n",
119 | " # Cross validation - 10 Fold \n",
120 | " kf = KFold(n_splits = 10)\n",
121 | "\n",
122 | " for train_indices, test_indices in kf.split(x_train):\n",
123 | " mlpClf.fit(x_train[train_indices], y_train[train_indices])\n",
124 | " print(mlpClf.score(x_train[test_indices], y_train[test_indices]))\n",
125 | " y_score = mlpClf.predict_proba(x_validation)[:,1]\n",
126 | " fpr, tpr, thresholds = roc_curve(y_validation, y_score)\n",
127 | " roc_auc = auc(fpr, tpr)\n",
128 | " roc = str(roc_auc)\n",
129 | " name = f.replace(\".npy\",\"\")+\"_\"+str(i)\n",
130 | " print(name+\" : \"+str(roc_auc))\n",
131 | " # with open( \"spacyW2vMlp\" + \".txt\", \"a\") as file: #name\n",
132 | " # file.write(\"\\t pca_\" +str(i)+ \" : \" + roc+\"\\n\")\n",
133 | " # file.close()\n",
134 | "\n",
135 | "\n",
136 | "# df_res = pd.DataFrame({\"pred_y\": y_score, \"true_y\":y_validation})\n",
137 | "# df_res.to_csv (r'../working/W2v.csv', index = False, header=True)\n",
138 | "\n",
139 | " "
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### balanced results different pca's\n",
147 | "\n",
148 | "split = 1079
\n",
149 | "x_train.shape = (2158, 10)
\n",
150 | "x_validation.shape = (2842, 10)
\n",
151 | "y_train.shape = (2158,)
\n",
152 | "y_validation.shape = (2842,)
\n",
153 | "\n",
154 | "lPunctNumStopLemOovAgg_10 : 0.7773587350959046
\n",
155 | "lPunctNumStopLemOovAgg_20 : 0.7794985887909682
\n",
156 | "lPunctNumStopLemOovAgg_30 : 0.8106258280052993
\n",
157 | "lPunctNumStopLemOovAgg_40 : 0.8099159034617822
\n",
158 | "lPunctNumStopLemOovAgg_50 : 0.8292624272795345
\n",
159 | "lPunctNumStopLemOovAgg_60 : 0.8135677668337078
\n",
160 | "lPunctNumStopLemOovAgg_70 : 0.8236233511894476
\n",
161 | "lPunctNumStopLemOovAgg_75 : 0.8321510857669489
\n",
162 | "lPunctNumStopLemOovAgg_80 : 0.8347517424111515
\n",
163 | "lPunctNumStopLemOovAgg_85 : 0.7923204308507575
\n",
164 | "lPunctNumStopLemOovAgg_90 : 0.8280326594090203
\n",
165 | "lPunctNumStopLemOovAgg_100 : 0.8135144864927135
\n",
166 | "lPunctNumStopLemOovAgg_110 : 0.7994311963596568
\n",
167 | "lPunctNumStopLemOovAgg_120 : 0.8126332008524854
\n",
168 | "lPunctNumStopLemOovAgg_130 : 0.7794049881919244
\n",
169 | "lPunctNumStopLemOovAgg_140 : 0.8027504176026727
\n",
170 | "lPunctNumStopLemOovAgg_150 : 0.7867101549449917
\n",
171 | "lPunctNumStopLemOovAgg_160 : 0.8010440066816429
\n",
172 | "lPunctNumStopLemOovAgg_165 : 0.8039686653994587
\n",
173 | "lPunctNumStopLemOovAgg_170 : 0.8058219572605264
\n",
174 | "lPunctNumStopLemOovAgg_180 : 0.8310192385231266
\n",
175 | "lPunctNumStopLemOovAgg_190 : 0.8154628189620413
"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# save results\n",
185 | "df_res = pd.DataFrame({\"pred_y\": y_score, \"true_y\":y_validation})\n",
186 | "df_res.to_csv (r'../working/W2v_bal.csv', index = False, header=True)"
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.7.4"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 4
211 | }
212 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/intermediate_models/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Description
2 |
3 | Here are the implementations of the models used to obtain the 1000 predictions on which we trained the logistic regression.
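4 | 
5 | In outline, each intermediate model writes a CSV of its validation predictions (a `pred_y` column) next to the true labels (`true_y`), and the logistic regression is then fitted on those columns. A condensed sketch of that stacking step, with paths assumed relative to wherever the prediction CSVs were written:
6 | 
7 | ```python
8 | import pandas as pd
9 | from sklearn.linear_model import LogisticRegression
10 | 
11 | # Per-model validation predictions saved by the notebooks in this folder.
12 | subs = pd.read_csv("subs.csv")
13 | bodieswSdrop = pd.read_csv("bodieswSdrop.csv")
14 | W2v = pd.read_csv("W2v.csv")
15 | 
16 | # Stack the per-model probabilities side by side as meta-features.
17 | X = pd.DataFrame({
18 |     "bodieswSdrop_y": bodieswSdrop["pred_y"],
19 |     "subs_y": subs["pred_y"],
20 |     "W2v_y": W2v["pred_y"],
21 | }).to_numpy()
22 | y = subs["true_y"].to_numpy()
23 | 
24 | # The meta-model trained on the 1000 stacked predictions.
25 | meta = LogisticRegression(C=1).fit(X, y)
26 | print(meta.predict_proba(X)[:5, 1])   # ensemble probabilities for class 1
27 | ```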
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/intermediate_models/spactW2v.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# W2v\n",
8 | "\n",
9 | "A classification of W2v spaCy vectors, using scikit MLP"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
17 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
18 | },
19 | "outputs": [],
20 | "source": [
21 | "from sklearn.preprocessing import StandardScaler # For scaling\n",
22 | "from sklearn.model_selection import train_test_split # for creating valid set and train set \n",
23 | "from sklearn.neural_network import MLPClassifier\n",
24 | "from sklearn.model_selection import KFold\n",
25 | "from sklearn.svm import SVC, LinearSVC\n",
26 | "import numpy as np\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "from sklearn.metrics import roc_curve, auc\n",
29 | "import os\n",
30 | "from os import listdir\n",
31 | "from os.path import isfile, join\n",
32 | "from sklearn.decomposition import PCA\n",
33 | "import pandas as pd\n",
34 | "import math\n",
35 | "from sklearn.utils import shuffle"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
43 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
44 | },
45 | "outputs": [],
46 | "source": [
47 | "y = np.load(\"targets.npy\").tolist()\n",
48 | "files = listdir(\"npy5000/\")\n",
49 | "files = [f for f in files if f == \"lPunctNumStopLemOovAgg.npy\"]\n",
50 | "for f in files:\n",
51 | " for i in range(1):\n",
52 | " X = np.load(\"../input/mydata/npy5000/\"+f)\n",
53 | " i = 75\n",
54 | " pca = PCA(i)\n",
55 | " pca.fit(X)\n",
56 | " U = pca.transform(X)\n",
57 | " U = U.tolist()\n",
58 | " df = pd.DataFrame({\"vect\": U, \"gender\": y})\n",
59 | " # unbalnced \n",
60 | " seed = 100\n",
61 | " split = math.floor(len(df)*0.8)\n",
62 | " train_df = df.sample(split, random_state = 100)\n",
63 | " test_df = df.drop(train_df.index)\n",
64 | " x_train = np.array(train_df[\"vect\"].tolist())\n",
65 | " print(\"x_train.shape = \", x_train.shape)\n",
66 | " #print(\"x_train[0] = \", x_train[0])\n",
67 | " x_validation = np.array(test_df[\"vect\"].tolist())\n",
68 | " print(\"x_validation.shape = \", x_validation.shape)\n",
69 | " y_train = np.array(train_df[\"gender\"].tolist())\n",
70 | " print(\"y_train.shape = \", y_train.shape)\n",
71 | " y_validation = np.array(test_df[\"gender\"].tolist())\n",
72 | " print(\"y_validation.shape = \", y_validation.shape) \n",
73 | "\n",
74 | "# end of unbalanced\n",
75 | " \n",
76 | " # balanced part\n",
77 | "# U_m = df.loc[df[\"gender\"] == 0, :]\n",
78 | "# U_f = df.loc[df[\"gender\"] == 1, :]\n",
79 | "\n",
80 | "# split = math.floor(len(U_f)*0.8)\n",
81 | "# print(\"split = \",split)\n",
82 | "\n",
83 | "# seed = 100\n",
84 | "\n",
85 | "# train_data_sample_m = U_m.sample(n = split, random_state = seed)\n",
86 | "# train_vects_m =train_data_sample_m[\"vect\"].tolist()\n",
87 | "# test_data_sample_m = U_m.drop(train_data_sample_m.index)\n",
88 | "# #test_data_sample_m = test_data_sample_m.reset_index() \n",
89 | "# test_vects_m = test_data_sample_m[\"vect\"].tolist()\n",
90 | "\n",
91 | "# train_data_sample_f = U_f.sample(n = split, random_state = seed)\n",
92 | "# train_vects_f = train_data_sample_f[\"vect\"].tolist()\n",
93 | "# test_data_sample_f = U_f.drop(train_data_sample_f.index)\n",
94 | "# #test_data_sample_f = test_data_sample_f.reset_index() \n",
95 | "# test_vects_f = test_data_sample_f[\"vect\"].tolist()\n",
96 | "\n",
97 | "# train_vects = train_vects_m + train_vects_f\n",
98 | "# test_vects = test_vects_m + test_vects_f\n",
99 | "\n",
100 | "# train_labels = [0 for i in range(split)] + [1 for i in range(split)]\n",
101 | "# test_labels = [0 for i in range(len(U_m)-split)] + [1 for i in range(len(U_f)-split)]\n",
102 | "# x_train = np.array(train_vects)\n",
103 | "# print(\"x_train.shape = \", x_train.shape)\n",
104 | "# #print(\"x_train[0] = \", x_train[0])\n",
105 | "# x_validation = np.array(test_vects)\n",
106 | "# print(\"x_validation.shape = \", x_validation.shape)\n",
107 | "# y_train = np.array(train_labels)\n",
108 | "# print(\"y_train.shape = \", y_train.shape)\n",
109 | "# y_validation = np.array(test_labels)\n",
110 | "# print(\"y_validation.shape = \", y_validation.shape)\n",
111 | "# x_train, y_train = shuffle(x_train, y_train, random_state = 0)\n",
112 | " # end of balanced\n",
113 | " \n",
114 | " \n",
115 | " # model\n",
116 | " mlpClf = MLPClassifier(solver = 'adam', activation= 'relu' ,alpha = 0.02, verbose = False, early_stopping = True,\n",
117 | " learning_rate = 'invscaling', max_iter = 400)\n",
118 | "\n",
119 | " # Cross validation - 10 Fold \n",
120 | " kf = KFold(n_splits = 10)\n",
121 | "\n",
122 | " for train_indices, test_indices in kf.split(x_train):\n",
123 | " mlpClf.fit(x_train[train_indices], y_train[train_indices])\n",
124 | " print(mlpClf.score(x_train[test_indices], y_train[test_indices]))\n",
125 | " y_score = mlpClf.predict_proba(x_validation)[:,1]\n",
126 | " fpr, tpr, thresholds = roc_curve(y_validation, y_score)\n",
127 | " roc_auc = auc(fpr, tpr)\n",
128 | " roc = str(roc_auc)\n",
129 | " name = f.replace(\".npy\",\"\")+\"_\"+str(i)\n",
130 | " print(name+\" : \"+str(roc_auc))\n",
131 | " # with open( \"spacyW2vMlp\" + \".txt\", \"a\") as file: #name\n",
132 | " # file.write(\"\\t pca_\" +str(i)+ \" : \" + roc+\"\\n\")\n",
133 | " # file.close()\n",
134 | "\n",
135 | "\n",
136 | "# df_res = pd.DataFrame({\"pred_y\": y_score, \"true_y\":y_validation})\n",
137 | "# df_res.to_csv (r'../working/W2v.csv', index = False, header=True)\n",
138 | "\n",
139 | " "
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### balanced results different pca's\n",
147 | "\n",
148 | "split = 1079
\n",
149 | "x_train.shape = (2158, 10)
\n",
150 | "x_validation.shape = (2842, 10)
\n",
151 | "y_train.shape = (2158,)
\n",
152 | "y_validation.shape = (2842,)
\n",
153 | "\n",
154 | "lPunctNumStopLemOovAgg_10 : 0.7773587350959046
\n",
155 | "lPunctNumStopLemOovAgg_20 : 0.7794985887909682
\n",
156 | "lPunctNumStopLemOovAgg_30 : 0.8106258280052993
\n",
157 | "lPunctNumStopLemOovAgg_40 : 0.8099159034617822
\n",
158 | "lPunctNumStopLemOovAgg_50 : 0.8292624272795345
\n",
159 | "lPunctNumStopLemOovAgg_60 : 0.8135677668337078
\n",
160 | "lPunctNumStopLemOovAgg_70 : 0.8236233511894476
\n",
161 | "lPunctNumStopLemOovAgg_75 : 0.8321510857669489
\n",
162 | "lPunctNumStopLemOovAgg_80 : 0.8347517424111515
\n",
163 | "lPunctNumStopLemOovAgg_85 : 0.7923204308507575
\n",
164 | "lPunctNumStopLemOovAgg_90 : 0.8280326594090203
\n",
165 | "lPunctNumStopLemOovAgg_100 : 0.8135144864927135
\n",
166 | "lPunctNumStopLemOovAgg_110 : 0.7994311963596568
\n",
167 | "lPunctNumStopLemOovAgg_120 : 0.8126332008524854
\n",
168 | "lPunctNumStopLemOovAgg_130 : 0.7794049881919244
\n",
169 | "lPunctNumStopLemOovAgg_140 : 0.8027504176026727
\n",
170 | "lPunctNumStopLemOovAgg_150 : 0.7867101549449917
\n",
171 | "lPunctNumStopLemOovAgg_160 : 0.8010440066816429
\n",
172 | "lPunctNumStopLemOovAgg_165 : 0.8039686653994587
\n",
173 | "lPunctNumStopLemOovAgg_170 : 0.8058219572605264
\n",
174 | "lPunctNumStopLemOovAgg_180 : 0.8310192385231266
\n",
175 | "lPunctNumStopLemOovAgg_190 : 0.8154628189620413
"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# save results\n",
185 | "df_res = pd.DataFrame({\"pred_y\": y_score, \"true_y\":y_validation})\n",
186 | "df_res.to_csv (r'../working/W2v_bal.csv', index = False, header=True)"
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.7.4"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 4
211 | }
212 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/.ipynb_checkpoints/bow_bal_lPunctAgg-checkpoint.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctAgg
2 | epoch = 0, losses = {'textcat': 9.083782873424466}, roc = 0.9084506940844422
3 | epoch = 1, losses = {'textcat': 12.229801848536603}, roc = 0.88266228903865
4 | epoch = 2, losses = {'textcat': 13.809510084057532}, roc = 0.9050421922700305
5 | epoch = 3, losses = {'textcat': 14.714626950012775}, roc = 0.91474713438166
6 | epoch = 4, losses = {'textcat': 15.311987071944198}, roc = 0.9039636253672024
7 | epoch = 5, losses = {'textcat': 15.714608116661427}, roc = 0.9031608202292495
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/.ipynb_checkpoints/ensemble_lPunctNumStopOovAgg-checkpoint.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumStopOovAgg epoch = 0, losses = {'textcat': 10.528761210793164}, roc = 0.8673289693235784
2 | epoch = 1, losses = {'textcat': 16.21045877024153}, roc = 0.889338980875369
3 | epoch = 2, losses = {'textcat': 19.006704837200232}, roc = 0.8886818123475806
4 | epoch = 3, losses = {'textcat': 20.306096649514416}, roc = 0.87391092285971
5 | epoch = 4, losses = {'textcat': 20.95130760752823}, roc = 0.8736644846617894
6 | epoch = 5, losses = {'textcat': 21.27921311019021}, roc = 0.8782441278398152
7 | epoch = 6, losses = {'textcat': 21.510209205060605}, roc = 0.8773328199204212
8 | epoch = 7, losses = {'textcat': 21.645089315511605}, roc = 0.8748350661019125
9 | epoch = 8, losses = {'textcat': 21.741663373617406}, roc = 0.8733923758182519
10 | epoch = 9, losses = {'textcat': 21.76332084200426}, roc = 0.8705532024130407
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctAgg
2 | epoch = 0, losses = {'textcat': 9.302060287109384}, roc = 0.822655664996256
3 | epoch = 1, losses = {'textcat': 13.162341818209484}, roc = 0.830693076435689
4 | epoch = 2, losses = {'textcat': 15.271727846679754}, roc = 0.8276380968838201
5 | epoch = 3, losses = {'textcat': 16.73283363978179}, roc = 0.8206403720983815
6 | epoch = 4, losses = {'textcat': 17.417289779973682}, roc = 0.8212559760382466
7 | epoch = 5, losses = {'textcat': 17.877576482184697}, roc = 0.8197432463567768
8 | epoch = 6, losses = {'textcat': 18.1894339097069}, roc = 0.8176357928690744
9 | bow_bal_lPunctAgg
10 | epoch = 0, losses = {'textcat': 9.32271635598022}, roc = 0.8531601002246415
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumAgg
2 | epoch = 0, losses = {'textcat': 9.070996835451297}, roc = 0.8518705719716606
3 | epoch = 1, losses = {'textcat': 12.505073593091929}, roc = 0.843479638269685
4 | epoch = 2, losses = {'textcat': 14.681906531202715}, roc = 0.8455186913196244
5 | epoch = 3, losses = {'textcat': 15.9343872602596}, roc = 0.8499308795576291
6 | epoch = 4, losses = {'textcat': 16.56670094780488}, roc = 0.8508596855019872
7 | epoch = 5, losses = {'textcat': 16.947070740996637}, roc = 0.848119347963827
8 | epoch = 6, losses = {'textcat': 17.27540420358229}, roc = 0.8479767870514371
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumLemAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumLemAgg
2 | epoch = 0, losses = {'textcat': 8.920310128462006}, roc = 0.8499539197050862
3 | epoch = 1, losses = {'textcat': 12.793419339724329}, roc = 0.8537411439433212
4 | epoch = 2, losses = {'textcat': 15.015833610607764}, roc = 0.8455287713841368
5 | epoch = 3, losses = {'textcat': 16.48632318486966}, roc = 0.8568537238638327
6 | epoch = 4, losses = {'textcat': 17.36400533343001}, roc = 0.857495967974195
7 | epoch = 5, losses = {'textcat': 17.890244026477234}, roc = 0.8565642820114049
8 | epoch = 6, losses = {'textcat': 18.371415728931197}, roc = 0.851052646736939
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumLemOovAgg
2 | epoch = 0, losses = {'textcat': 8.70647652117259}, roc = 0.84155218593399
3 | epoch = 1, losses = {'textcat': 13.266189976038829}, roc = 0.848024307355567
4 | epoch = 2, losses = {'textcat': 16.120750505303732}, roc = 0.8411057830770117
5 | epoch = 3, losses = {'textcat': 18.030914867106905}, roc = 0.8529383388053684
6 | epoch = 4, losses = {'textcat': 19.245343244869073}, roc = 0.8566902828178098
7 | epoch = 5, losses = {'textcat': 19.979103932247735}, roc = 0.8562568400437762
8 | epoch = 6, losses = {'textcat': 20.6539379920545}, roc = 0.8522132941650826
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumOovAgg
2 | epoch = 0, losses = {'textcat': 8.897850947932056}, roc = 0.8423024307355567
3 | epoch = 1, losses = {'textcat': 12.809824797757333}, roc = 0.8437762801681931
4 | epoch = 2, losses = {'textcat': 15.446070999403444}, roc = 0.8499942399631357
5 | epoch = 3, losses = {'textcat': 17.059131857684534}, roc = 0.8522507344047002
6 | epoch = 4, losses = {'textcat': 18.157829035962052}, roc = 0.8514767294510686
7 | epoch = 5, losses = {'textcat': 18.70078135735473}, roc = 0.8495960774148954
8 | epoch = 6, losses = {'textcat': 19.13037149310106}, roc = 0.8482842290190657
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumPersAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumPersAgg
2 | epoch = 0, losses = {'textcat': 7.942127413491107}, roc = 0.8563216404584989
3 | epoch = 1, losses = {'textcat': 10.657648807610341}, roc = 0.8576126087206958
4 | epoch = 2, losses = {'textcat': 12.023256314509185}, roc = 0.8550933125972007
5 | epoch = 3, losses = {'textcat': 12.718376671023494}, roc = 0.853201140487299
6 | epoch = 4, losses = {'textcat': 13.205239917829605}, roc = 0.8536230631876045
7 | epoch = 5, losses = {'textcat': 13.471313663067724}, roc = 0.8510332066125224
8 | epoch = 6, losses = {'textcat': 13.686726476120343}, roc = 0.8495355970278211
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumPersLemAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumPersLemAgg
2 | epoch = 0, losses = {'textcat': 8.12144905304558}, roc = 0.8489804734750304
3 | epoch = 1, losses = {'textcat': 11.43250018897337}, roc = 0.8533357813490006
4 | epoch = 2, losses = {'textcat': 13.055823825370439}, roc = 0.8501929612349518
5 | epoch = 3, losses = {'textcat': 14.094245906230416}, roc = 0.8575866885548068
6 | epoch = 4, losses = {'textcat': 14.65576545783411}, roc = 0.8568710039744254
7 | epoch = 5, losses = {'textcat': 15.022920232249685}, roc = 0.8543229076666092
8 | epoch = 6, losses = {'textcat': 15.303569548450263}, roc = 0.8525725764644894
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumPersLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumPersLemOovAgg
2 | epoch = 0, losses = {'textcat': 7.9846539189748}, roc = 0.857177005932838
3 | epoch = 1, losses = {'textcat': 11.407339283906628}, roc = 0.8603176660330626
4 | epoch = 2, losses = {'textcat': 13.37722663999433}, roc = 0.8533451414089049
5 | epoch = 3, losses = {'textcat': 14.584860815160596}, roc = 0.858619175162721
6 | epoch = 4, losses = {'textcat': 15.31267213681097}, roc = 0.8586213351765453
7 | epoch = 5, losses = {'textcat': 15.78524821235868}, roc = 0.8571561257992052
8 | epoch = 6, losses = {'textcat': 16.206703309529505}, roc = 0.8544035481827085
9 | bow_bal_lPunctNumPersLemOovAgg
10 | epoch = 0, losses = {'textcat': 10.543100330862217}, roc = 0.8660865100757283
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumStopLemAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumStopLemAgg
2 | epoch = 0, losses = {'textcat': 7.6965910377621185}, roc = 0.8627325614883935
3 | epoch = 1, losses = {'textcat': 10.470721331600874}, roc = 0.8617792753873624
4 | epoch = 2, losses = {'textcat': 11.779465222827628}, roc = 0.8552315534819421
5 | epoch = 3, losses = {'textcat': 12.66895769556537}, roc = 0.8541529865791141
6 | epoch = 4, losses = {'textcat': 13.206042752635458}, roc = 0.8515631300040319
7 | epoch = 5, losses = {'textcat': 13.554137631988128}, roc = 0.8463330165313057
8 | epoch = 6, losses = {'textcat': 13.809782768729407}, roc = 0.8421296296296296
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumStopLemOovAgg
2 | epoch = 0, losses = {'textcat': 7.722838822774065}, roc = 0.8669237083117332
3 | epoch = 1, losses = {'textcat': 10.851907326930231}, roc = 0.8626447209262138
4 | epoch = 2, losses = {'textcat': 12.364781665208959}, roc = 0.8523414549853119
5 | epoch = 3, losses = {'textcat': 13.364213726853716}, roc = 0.8538973849432637
6 | epoch = 4, losses = {'textcat': 14.003232195298567}, roc = 0.8549809918783481
7 | epoch = 5, losses = {'textcat': 14.470020648783343}, roc = 0.8541781867403951
8 | epoch = 6, losses = {'textcat': 14.829334140441915}, roc = 0.8510605667876274
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_bal_lPunctNumStopOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_bal_lPunctNumStopOovAgg
2 | epoch = 0, losses = {'textcat': 7.644319462408021}, roc = 0.8646297736305512
3 | epoch = 1, losses = {'textcat': 10.299294181489332}, roc = 0.8604047865906342
4 | epoch = 2, losses = {'textcat': 11.559321389345481}, roc = 0.8573375669604286
5 | epoch = 3, losses = {'textcat': 12.309172212446663}, roc = 0.8520440930821958
6 | epoch = 4, losses = {'textcat': 12.788964540282382}, roc = 0.8490193537238638
7 | epoch = 5, losses = {'textcat': 13.128131629973968}, roc = 0.8473633431253961
8 | epoch = 6, losses = {'textcat': 13.391003333925637}, roc = 0.8459773342549393
9 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_dlPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_dlPunctNumStopLemOovAgg
2 | epoch = 0, losses = {'textcat': 8.904687333793845}, roc = 0.8557315879618302
3 | epoch = 1, losses = {'textcat': 12.338626464243827}, roc = 0.8615503425495473
4 | epoch = 2, losses = {'textcat': 14.526519826557568}, roc = 0.8663368607780768
5 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctAgg
2 | epoch = 0, losses = {'textcat': 10.300575028954164}, roc = 0.8472442561930431
3 | epoch = 1, losses = {'textcat': 15.44146652551996}, roc = 0.8600077011936851
4 | epoch = 2, losses = {'textcat': 18.253285435726045}, roc = 0.8596842510589141
5 | epoch = 3, losses = {'textcat': 20.024297386326218}, roc = 0.8545860608394301
6 | epoch = 4, losses = {'textcat': 21.157677383200436}, roc = 0.8565370299063022
7 | epoch = 5, losses = {'textcat': 22.03504304502588}, roc = 0.8572660762418175
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumAgg
2 | epoch = 0, losses = {'textcat': 10.316491311206551}, roc = 0.8487459889616223
3 | epoch = 1, losses = {'textcat': 15.3834985842316}, roc = 0.8674675908099089
4 | epoch = 2, losses = {'textcat': 18.139316414716088}, roc = 0.8616275189321012
5 | epoch = 3, losses = {'textcat': 19.851637275560474}, roc = 0.8596585804132975
6 | epoch = 4, losses = {'textcat': 20.907842139506286}, roc = 0.8587010653317931
7 | epoch = 5, losses = {'textcat': 21.722357641635657}, roc = 0.8530252855859325
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumLemAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumLemAgg
2 | epoch = 0, losses = {'textcat': 10.29036844101829}, roc = 0.8553972532409191
3 | epoch = 1, losses = {'textcat': 15.721021297912802}, roc = 0.8621486330381211
4 | epoch = 2, losses = {'textcat': 18.663440811308213}, roc = 0.8572506738544474
5 | epoch = 3, losses = {'textcat': 20.400520129909953}, roc = 0.8703195995379285
6 | epoch = 4, losses = {'textcat': 21.658334924989223}, roc = 0.8669901168014378
7 | epoch = 5, losses = {'textcat': 22.603086117967706}, roc = 0.8675908099088694
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumLemOovAgg
2 | epoch = 0, losses = {'textcat': 10.197543423597457}, roc = 0.8483275574380695
3 | epoch = 1, losses = {'textcat': 15.81021447241982}, roc = 0.8581542805801566
4 | epoch = 2, losses = {'textcat': 19.228670098632225}, roc = 0.8616968296752663
5 | epoch = 3, losses = {'textcat': 21.795570858355152}, roc = 0.8520806058272367
6 | epoch = 4, losses = {'textcat': 23.384679663050104}, roc = 0.8497907842382235
7 | epoch = 5, losses = {'textcat': 24.688360156274754}, roc = 0.8550686689770248
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumOovAgg
2 | epoch = 0, losses = {'textcat': 10.606671018825214}, roc = 0.840441535104608
3 | epoch = 1, losses = {'textcat': 16.482640654656333}, roc = 0.8509151585162368
4 | epoch = 2, losses = {'textcat': 19.598626431889233}, roc = 0.8592555512771146
5 | epoch = 3, losses = {'textcat': 21.42554874333712}, roc = 0.865665511487614
6 | epoch = 4, losses = {'textcat': 22.87796032469087}, roc = 0.8701116673084329
7 | epoch = 5, losses = {'textcat': 23.912508471060537}, roc = 0.8683788987293031
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumPersAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumPersAgg
2 | epoch = 0, losses = {'textcat': 9.659992030909795}, roc = 0.8598382749326147
3 | epoch = 1, losses = {'textcat': 13.254407076947524}, roc = 0.8739340264407651
4 | epoch = 2, losses = {'textcat': 15.111549782759436}, roc = 0.8792940572455397
5 | epoch = 3, losses = {'textcat': 15.997021635469657}, roc = 0.877930945963291
6 | epoch = 4, losses = {'textcat': 16.575501656098304}, roc = 0.8770196380438967
7 | epoch = 5, losses = {'textcat': 16.928584417800174}, roc = 0.8734591194968553
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumPersLemAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumPersLemAgg
2 | epoch = 0, losses = {'textcat': 9.383625615081417}, roc = 0.8685303555384417
3 | epoch = 1, losses = {'textcat': 13.28497734584512}, roc = 0.8756565267616481
4 | epoch = 2, losses = {'textcat': 15.407424367942612}, roc = 0.8725272750609678
5 | epoch = 3, losses = {'textcat': 16.60949956353454}, roc = 0.8703658067000386
6 | epoch = 4, losses = {'textcat': 17.3632590999101}, roc = 0.8677140290078296
7 | epoch = 5, losses = {'textcat': 17.834758405944516}, roc = 0.8680580156590938
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumPersLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumPersLemOovAgg
2 | epoch = 0, losses = {'textcat': 9.513648208222001}, roc = 0.8517058144012322
3 | epoch = 1, losses = {'textcat': 13.863807656152193}, roc = 0.864453857014504
4 | epoch = 2, losses = {'textcat': 16.46012714357848}, roc = 0.8673289693235784
5 | epoch = 3, losses = {'textcat': 18.004219502897023}, roc = 0.8680041073032987
6 | epoch = 4, losses = {'textcat': 19.085807765813307}, roc = 0.8678192786548582
7 | epoch = 5, losses = {'textcat': 19.84717152449107}, roc = 0.8641509433962263
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumStopLemAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumStopLemAgg
2 | epoch = 0, losses = {'textcat': 8.883919595161183}, roc = 0.8709690668720317
3 | epoch = 1, losses = {'textcat': 12.155519061331876}, roc = 0.8812706969580286
4 | epoch = 2, losses = {'textcat': 13.865362251984102}, roc = 0.8741060197663971
5 | epoch = 3, losses = {'textcat': 14.811687069882568}, roc = 0.8711564625850341
6 | epoch = 4, losses = {'textcat': 15.499547603402688}, roc = 0.8669721473495058
7 | epoch = 5, losses = {'textcat': 15.88002811915746}, roc = 0.862133230650751
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumStopLemOovAgg
2 | epoch = 0, losses = {'textcat': 8.991991002214835}, roc = 0.8685611603131819
3 | epoch = 1, losses = {'textcat': 12.712942150187756}, roc = 0.8808060582723656
4 | epoch = 2, losses = {'textcat': 14.934007483348498}, roc = 0.8775561545372866
5 | epoch = 3, losses = {'textcat': 16.26776459135772}, roc = 0.8723527146707739
6 | epoch = 4, losses = {'textcat': 17.113776076989428}, roc = 0.868710050057759
7 | epoch = 5, losses = {'textcat': 17.63977204713988}, roc = 0.8639199075856757
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/bow_lPunctNumStopOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_lPunctNumStopOovAgg
2 | epoch = 0, losses = {'textcat': 9.135657884018336}, roc = 0.8800667436786035
3 | epoch = 1, losses = {'textcat': 12.349658746887858}, roc = 0.8848029777948915
4 | epoch = 2, losses = {'textcat': 14.074350827866946}, roc = 0.8828571428571428
5 | epoch = 3, losses = {'textcat': 14.991081261942854}, roc = 0.8824207418816584
6 | epoch = 4, losses = {'textcat': 15.623346206312354}, roc = 0.8787061994609164
7 | epoch = 5, losses = {'textcat': 16.009936011346397}, roc = 0.8759440379925556
8 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctAgg
2 | epoch = 0, losses = {'textcat': 10.771006600931287}, roc = 0.7958095731812684
3 | epoch = 1, losses = {'textcat': 17.779007678705966}, roc = 0.8307917170669892
4 | epoch = 2, losses = {'textcat': 21.071200552220034}, roc = 0.8122213582166926
5 | epoch = 3, losses = {'textcat': 22.43127129951489}, roc = 0.7965295777892979
6 | epoch = 4, losses = {'textcat': 23.157470632704094}, roc = 0.8051638730487874
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumAgg
2 | epoch = 0, losses = {'textcat': 11.111040544696152}, roc = 0.7908804216346984
3 | epoch = 1, losses = {'textcat': 19.10246680257842}, roc = 0.8461609354299866
4 | epoch = 2, losses = {'textcat': 23.33828358113533}, roc = 0.8425306721963021
5 | epoch = 3, losses = {'textcat': 25.14938404349232}, roc = 0.828738263924889
6 | epoch = 4, losses = {'textcat': 25.967542636937917}, roc = 0.8076925292321872
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumLemAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumLemAgg
2 | epoch = 0, losses = {'textcat': 10.921858021989465}, roc = 0.8557384367259951
3 | epoch = 1, losses = {'textcat': 18.427781807025895}, roc = 0.8557974771038537
4 | epoch = 2, losses = {'textcat': 22.933604589139577}, roc = 0.8830294913887449
5 | epoch = 3, losses = {'textcat': 25.160109569373162}, roc = 0.869435804389148
6 | epoch = 4, losses = {'textcat': 26.235555890430874}, roc = 0.854176026726571
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumLemOovAgg
2 | epoch = 0, losses = {'textcat': 11.09798441780731}, roc = 0.8316643626519211
3 | epoch = 1, losses = {'textcat': 19.27897498011589}, roc = 0.8633877656817004
4 | epoch = 2, losses = {'textcat': 24.195329733367544}, roc = 0.8631213639767296
5 | epoch = 3, losses = {'textcat': 26.952326160404482}, roc = 0.8620413570646853
6 | epoch = 4, losses = {'textcat': 28.221509296149407}, roc = 0.8533926617130351
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumOovAgg
2 | epoch = 0, losses = {'textcat': 10.964338312391192}, roc = 0.7805598755832037
3 | epoch = 1, losses = {'textcat': 18.715853770030662}, roc = 0.8555757156845805
4 | epoch = 2, losses = {'textcat': 23.049163569317898}, roc = 0.8676962732561488
5 | epoch = 3, losses = {'textcat': 25.176179951930294}, roc = 0.860621507977651
6 | epoch = 4, losses = {'textcat': 26.10593884226084}, roc = 0.8471415817061229
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumPersAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumPersAgg
2 | epoch = 0, losses = {'textcat': 10.61521192966029}, roc = 0.8436812395599332
3 | epoch = 1, losses = {'textcat': 17.363522986648604}, roc = 0.8743044755486434
4 | epoch = 2, losses = {'textcat': 20.616124440028216}, roc = 0.8676768331317319
5 | epoch = 3, losses = {'textcat': 21.97993107588991}, roc = 0.864473532630609
6 | epoch = 4, losses = {'textcat': 22.626839527907563}, roc = 0.8439764414492251
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumPersLemAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumPersLemAgg
2 | epoch = 0, losses = {'textcat': 10.641894780797884}, roc = 0.8053179540349057
3 | epoch = 1, losses = {'textcat': 17.899752411001828}, roc = 0.8604811070790852
4 | epoch = 2, losses = {'textcat': 21.62845124416799}, roc = 0.8628232820690054
5 | epoch = 3, losses = {'textcat': 23.424733716310357}, roc = 0.8736708714935776
6 | epoch = 4, losses = {'textcat': 24.162155677251732}, roc = 0.8674586717354992
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumPersLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumPersLemOovAgg
2 | epoch = 0, losses = {'textcat': 10.861616820562631}, roc = 0.8464489372731985
3 | epoch = 1, losses = {'textcat': 18.343847768148407}, roc = 0.870999654397788
4 | epoch = 2, losses = {'textcat': 22.567757138167508}, roc = 0.8573786072230863
5 | epoch = 3, losses = {'textcat': 24.70317370561679}, roc = 0.8381976844651806
6 | epoch = 4, losses = {'textcat': 25.663438785077915}, roc = 0.8116050342722194
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumStopLemAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumStopLemAgg
2 | epoch = 0, losses = {'textcat': 10.737517139408737}, roc = 0.8420871493577559
3 | epoch = 1, losses = {'textcat': 17.890409937361255}, roc = 0.8593499798398709
4 | epoch = 2, losses = {'textcat': 21.311185453138023}, roc = 0.8693004435228385
5 | epoch = 3, losses = {'textcat': 22.775944610608576}, roc = 0.8635216865387938
6 | epoch = 4, losses = {'textcat': 23.41604380202767}, roc = 0.8496759979263868
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumStopLemOovAgg
2 | epoch = 0, losses = {'textcat': 10.73626277083531}, roc = 0.8194502044813087
3 | epoch = 1, losses = {'textcat': 17.72877553733997}, roc = 0.8599965439778816
4 | epoch = 2, losses = {'textcat': 21.301636069205415}, roc = 0.8570013248084787
5 | epoch = 3, losses = {'textcat': 22.83226279049137}, roc = 0.852296814699614
6 | epoch = 4, losses = {'textcat': 23.535492210306984}, roc = 0.8362507920050689
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_bal_lPunctNumStopOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_lPunctNumStopOovAgg
2 | epoch = 0, losses = {'textcat': 10.784322501625866}, roc = 0.8235895109728703
3 | epoch = 1, losses = {'textcat': 17.672744434559718}, roc = 0.8856099879039225
4 | epoch = 2, losses = {'textcat': 20.84777028957251}, roc = 0.8836371752779217
5 | epoch = 3, losses = {'textcat': 22.28241401606772}, roc = 0.8754190426818732
6 | epoch = 4, losses = {'textcat': 22.99313979086327}, roc = 0.8619981567882034
7 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_dlPunctNumLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | bow_dlPunctNumLemOovAgg
2 | epoch = 0, losses = {'textcat': 11.14930248935707}, roc = 0.8058885898376968
3 | epoch = 1, losses = {'textcat': 18.137150909838965}, roc = 0.870167604599951
4 | epoch = 2, losses = {'textcat': 21.942909762162685}, roc = 0.8624806296386918
5 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_dlPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_bal_dlPunctNumStopLemOovAgg
2 | epoch = 0, losses = {'textcat': 10.573204169631936}, roc = 0.8677208221189137
3 | epoch = 1, losses = {'textcat': 16.051280780535308}, roc = 0.8713655085229588
4 | epoch = 2, losses = {'textcat': 19.00531985885982}, roc = 0.8542482260827013
5 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctAgg epoch = 0, losses = {'textcat': 10.729639297001995}, roc = 0.842746759080991
2 | epoch = 1, losses = {'textcat': 17.388430001898087}, roc = 0.8567423950712361
3 | epoch = 2, losses = {'textcat': 21.088454915356124}, roc = 0.8632678731870107
4 | epoch = 3, losses = {'textcat': 22.955513816137795}, roc = 0.846782184571942
5 | epoch = 4, losses = {'textcat': 23.7844230104636}, roc = 0.8460377358490567
6 | epoch = 5, losses = {'textcat': 24.454516666314753}, roc = 0.8463355153382108
7 | epoch = 6, losses = {'textcat': 24.73753832080388}, roc = 0.8378025927352073
8 | epoch = 7, losses = {'textcat': 25.07438993472748}, roc = 0.8332126812989348
9 | epoch = 8, losses = {'textcat': 25.190393376835182}, roc = 0.8299011680143754
10 | epoch = 9, losses = {'textcat': 25.372851968512258}, roc = 0.8320677705044282
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumAgg epoch = 0, losses = {'textcat': 10.849251543346327}, roc = 0.8439994865870877
2 | epoch = 1, losses = {'textcat': 17.44105708837742}, roc = 0.8709896033885252
3 | epoch = 2, losses = {'textcat': 21.194835011254327}, roc = 0.8760467205750225
4 | epoch = 3, losses = {'textcat': 23.208214861448596}, roc = 0.8752714670773971
5 | epoch = 4, losses = {'textcat': 24.236906560875724}, roc = 0.8654550121935566
6 | epoch = 5, losses = {'textcat': 24.778005035438593}, roc = 0.8700346553715826
7 | epoch = 6, losses = {'textcat': 25.133107319834863}, roc = 0.8669747144140675
8 | epoch = 7, losses = {'textcat': 25.325295451989195}, roc = 0.8677037607495829
9 | epoch = 8, losses = {'textcat': 25.620281110071257}, roc = 0.8670722628674111
10 | epoch = 9, losses = {'textcat': 25.646207855614715}, roc = 0.8636786035168784
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumLemAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumLemAgg epoch = 0, losses = {'textcat': 10.678632560709957}, roc = 0.8591400333718393
2 | epoch = 1, losses = {'textcat': 17.103498125987244}, roc = 0.8714414067513797
3 | epoch = 2, losses = {'textcat': 20.854591814086234}, roc = 0.8708920549351816
4 | epoch = 3, losses = {'textcat': 22.87714549644079}, roc = 0.8660043640097548
5 | epoch = 4, losses = {'textcat': 23.852497358807643}, roc = 0.8584161211654473
6 | epoch = 5, losses = {'textcat': 24.415554903051355}, roc = 0.8565627005519187
7 | epoch = 6, losses = {'textcat': 24.75346492489465}, roc = 0.8536721858554742
8 | epoch = 7, losses = {'textcat': 24.919653205470567}, roc = 0.8565832370684123
9 | epoch = 8, losses = {'textcat': 24.99600611099764}, roc = 0.8565113592606854
10 | epoch = 9, losses = {'textcat': 25.150001712737367}, roc = 0.8512129380053909
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumLemOovAgg epoch = 0, losses = {'textcat': 10.632765143818688}, roc = 0.8680785521755873
2 | epoch = 1, losses = {'textcat': 16.603710685754777}, roc = 0.8756873315363882
3 | epoch = 2, losses = {'textcat': 20.098456279241873}, roc = 0.8748402002310357
4 | epoch = 3, losses = {'textcat': 22.197325268262148}, roc = 0.8670260557053011
5 | epoch = 4, losses = {'textcat': 23.376831758485014}, roc = 0.8527788473880118
6 | epoch = 5, losses = {'textcat': 24.049448740595956}, roc = 0.8547862918752405
7 | epoch = 6, losses = {'textcat': 24.454368137002774}, roc = 0.8548992427159544
8 | epoch = 7, losses = {'textcat': 24.841723376879056}, roc = 0.8579643178025927
9 | epoch = 8, losses = {'textcat': 24.95344296212421}, roc = 0.8519573867282763
10 | epoch = 9, losses = {'textcat': 25.03913710188129}, roc = 0.8460582723655501
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumOovAgg epoch = 0, losses = {'textcat': 10.849000597256236}, roc = 0.8273905788730586
2 | epoch = 1, losses = {'textcat': 17.615710976722767}, roc = 0.8691618534206136
3 | epoch = 2, losses = {'textcat': 21.519193997845832}, roc = 0.8813810807341804
4 | epoch = 3, losses = {'textcat': 23.555163298897355}, roc = 0.8833423180592992
5 | epoch = 4, losses = {'textcat': 24.592904370654782}, roc = 0.8853908355795148
6 | epoch = 5, losses = {'textcat': 25.232358423387602}, roc = 0.8834604030291362
7 | epoch = 6, losses = {'textcat': 25.71876220397092}, roc = 0.8715902964959569
8 | epoch = 7, losses = {'textcat': 25.852575093551987}, roc = 0.8668823000898472
9 | epoch = 8, losses = {'textcat': 26.09547118551534}, roc = 0.8692619689385188
10 | epoch = 9, losses = {'textcat': 26.272543905184413}, roc = 0.8659581568476448
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumPersAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumPersAgg epoch = 0, losses = {'textcat': 10.358336042787414}, roc = 0.8718983442433577
2 | epoch = 1, losses = {'textcat': 15.565269033104414}, roc = 0.8941599281221924
3 | epoch = 2, losses = {'textcat': 17.932983758057844}, roc = 0.8852214093184444
4 | epoch = 3, losses = {'textcat': 19.04217331202392}, roc = 0.8730381209087408
5 | epoch = 4, losses = {'textcat': 19.687709225343976}, roc = 0.874799127198049
6 | epoch = 5, losses = {'textcat': 20.034264428140364}, roc = 0.8718829418559876
7 | epoch = 6, losses = {'textcat': 20.304004829154785}, roc = 0.870219484020023
8 | epoch = 7, losses = {'textcat': 20.454276567252627}, roc = 0.8683506610191246
9 | epoch = 8, losses = {'textcat': 20.554914588447136}, roc = 0.8717391862405339
10 | epoch = 9, losses = {'textcat': 20.633671653256233}, roc = 0.8760826594788859
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumPersLemAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumPersLemAgg epoch = 0, losses = {'textcat': 10.325701178051531}, roc = 0.8664304967269927
2 | epoch = 1, losses = {'textcat': 15.913291760210996}, roc = 0.8915158516236684
3 | epoch = 2, losses = {'textcat': 18.7300274250465}, roc = 0.8803131818765243
4 | epoch = 3, losses = {'textcat': 20.120089923865635}, roc = 0.8712360415864459
5 | epoch = 4, losses = {'textcat': 20.804775718720222}, roc = 0.8688127326402258
6 | epoch = 5, losses = {'textcat': 21.14956892632512}, roc = 0.8665588499550764
7 | epoch = 6, losses = {'textcat': 21.423461501962542}, roc = 0.8723090745732256
8 | epoch = 7, losses = {'textcat': 21.56527505127891}, roc = 0.8700295212424592
9 | epoch = 8, losses = {'textcat': 21.688171283806636}, roc = 0.8668258246694905
10 | epoch = 9, losses = {'textcat': 21.79666106960388}, roc = 0.8704813246053137
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumPersLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumPersLemOovAgg epoch = 0, losses = {'textcat': 10.597144522122107}, roc = 0.8578462328327556
2 | epoch = 1, losses = {'textcat': 16.965219413206796}, roc = 0.8762982929020665
3 | epoch = 2, losses = {'textcat': 20.944239850628946}, roc = 0.8677756385573098
4 | epoch = 3, losses = {'textcat': 23.205930521911796}, roc = 0.8661275831087152
5 | epoch = 4, losses = {'textcat': 24.416154009721595}, roc = 0.8681504299833142
6 | epoch = 5, losses = {'textcat': 25.26161684111277}, roc = 0.8606956744962136
7 | epoch = 6, losses = {'textcat': 25.85344494000153}, roc = 0.8537440636632012
8 | epoch = 7, losses = {'textcat': 26.17937598666605}, roc = 0.8558952637658837
9 | epoch = 8, losses = {'textcat': 26.458660723634825}, roc = 0.8617533050956231
10 | epoch = 9, losses = {'textcat': 26.5519901394971}, roc = 0.8586676934924914
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumStopLemAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumStopLemAgg epoch = 0, losses = {'textcat': 10.419062311004382}, roc = 0.8478706199460917
2 | epoch = 1, losses = {'textcat': 16.18246567517781}, roc = 0.8796816839943525
3 | epoch = 2, losses = {'textcat': 19.30511438575013}, roc = 0.8775920934411501
4 | epoch = 3, losses = {'textcat': 20.81128679516405}, roc = 0.8779412142215377
5 | epoch = 4, losses = {'textcat': 21.478999641421396}, roc = 0.8684687459889617
6 | epoch = 5, losses = {'textcat': 21.86197143290219}, roc = 0.8719034783724811
7 | epoch = 6, losses = {'textcat': 21.966874151151934}, roc = 0.866907970735464
8 | epoch = 7, losses = {'textcat': 22.130305263283137}, roc = 0.865177769220896
9 | epoch = 8, losses = {'textcat': 22.24113396179771}, roc = 0.8653112565781029
10 | epoch = 9, losses = {'textcat': 22.4314443554733}, roc = 0.8684841483763316
11 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumStopLemOovAgg epoch = 0, losses = {'textcat': 10.346970351412892}, roc = 0.8683763316647415
2 | epoch = 1, losses = {'textcat': 15.972997819677403}, roc = 0.9038069567449621
3 | epoch = 2, losses = {'textcat': 18.941932999511664}, roc = 0.8922859709921704
4 | epoch = 3, losses = {'textcat': 20.461612598504217}, roc = 0.8880811192401488
5 | epoch = 4, losses = {'textcat': 21.24935047177315}, roc = 0.8843691438839687
6 | epoch = 5, losses = {'textcat': 21.649178645025486}, roc = 0.8818636888717752
7 | epoch = 6, losses = {'textcat': 21.950326816203656}, roc = 0.8808984725965858
8 | epoch = 7, losses = {'textcat': 22.071821111654856}, roc = 0.8750198947503529
9 | epoch = 8, losses = {'textcat': 22.14719987215875}, roc = 0.8718213323065076
10 | epoch = 9, losses = {'textcat': 22.282656720771637}, roc = 0.8829880631497883
11 | ensemble_lPunctNumStopLemOovAgg
12 | epoch = 0, losses = {'textcat': 10.490000442718156}, roc = 0.8498061866255937
13 | epoch = 1, losses = {'textcat': 16.547183903574478}, roc = 0.8772121678860224
14 | epoch = 2, losses = {'textcat': 19.78883196215702}, roc = 0.8746707739699653
15 | epoch = 3, losses = {'textcat': 21.212445096770466}, roc = 0.8524759337697343
16 | epoch = 4, losses = {'textcat': 21.841305388159622}, roc = 0.8331664741368244
17 | epoch = 5, losses = {'textcat': 22.164312648375436}, roc = 0.8297214734950583
18 | epoch = 6, losses = {'textcat': 22.348416818236934}, roc = 0.826420228468746
19 | epoch = 7, losses = {'textcat': 22.57289976127649}, roc = 0.8245514054678474
20 | epoch = 8, losses = {'textcat': 22.755370378420064}, roc = 0.8239917853934027
21 | epoch = 9, losses = {'textcat': 22.91652452440097}, roc = 0.8339417276344502
22 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/ensemble_lPunctNumStopOovAgg.txt:
--------------------------------------------------------------------------------
1 | ensemble_lPunctNumStopOovAgg epoch = 0, losses = {'textcat': 10.528761210793164}, roc = 0.8673289693235784
2 | epoch = 1, losses = {'textcat': 16.21045877024153}, roc = 0.889338980875369
3 | epoch = 2, losses = {'textcat': 19.006704837200232}, roc = 0.8886818123475806
4 | epoch = 3, losses = {'textcat': 20.306096649514416}, roc = 0.87391092285971
5 | epoch = 4, losses = {'textcat': 20.95130760752823}, roc = 0.8736644846617894
6 | epoch = 5, losses = {'textcat': 21.27921311019021}, roc = 0.8782441278398152
7 | epoch = 6, losses = {'textcat': 21.510209205060605}, roc = 0.8773328199204212
8 | epoch = 7, losses = {'textcat': 21.645089315511605}, roc = 0.8748350661019125
9 | epoch = 8, losses = {'textcat': 21.741663373617406}, roc = 0.8733923758182519
10 | epoch = 9, losses = {'textcat': 21.76332084200426}, roc = 0.8705532024130407
11 | ensemble_lPunctNumStopOovAgg
12 | epoch = 0, losses = {'textcat': 10.605348191806115}, roc = 0.8644795276601206
13 | epoch = 1, losses = {'textcat': 16.36319550999906}, roc = 0.8788910281093569
14 | epoch = 2, losses = {'textcat': 19.37329041323028}, roc = 0.8780028237710179
15 | epoch = 3, losses = {'textcat': 20.752987753380012}, roc = 0.8651315620587856
16 | epoch = 4, losses = {'textcat': 21.361254807152925}, roc = 0.859535361314337
17 | epoch = 5, losses = {'textcat': 21.65353330239163}, roc = 0.8487896290591709
18 | epoch = 6, losses = {'textcat': 21.97102079426821}, roc = 0.8549197792324477
19 | epoch = 7, losses = {'textcat': 22.053787052982}, roc = 0.8488461044795277
20 | epoch = 8, losses = {'textcat': 22.097230001885983}, roc = 0.838906430496727
21 | epoch = 9, losses = {'textcat': 22.243994371655553}, roc = 0.8437376460017969
22 |
--------------------------------------------------------------------------------
/Notebooks/other-attempts/spaCy/outputs/softmax_bert_lPunctNumStopLemOovAgg.txt:
--------------------------------------------------------------------------------
1 | softmax_bert_lPunctNumStopLemOovAgg
2 | epoch = 0, roc = 0.803136085548301
3 | epoch = 1, roc = 0.6947483419875585
4 |
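
These logs all share the same `epoch = …, losses = …, roc = …` shape, which appears to come from a spaCy 2.x `textcat` training loop scored each epoch with ROC-AUC on a held-out split; the `architecture` config value is what would distinguish the `ensemble_*` files from the `bow_*` ones. A minimal sketch of such a loop follows, assuming spaCy 2.x; the toy data, the `"M"` label name, and the dev split are placeholders, not the repository's actual pipeline.

```python
import random

import spacy
from sklearn.metrics import roc_auc_score
from spacy.util import minibatch, compounding

# Placeholder data: the real runs used preprocessed Reddit comments, with the
# preprocessing encoded by the flags in each file name (lPunct, Num, Stop, ...).
train_texts = ["example comment one", "another example comment"]
train_labels = [1, 0]
dev_texts, dev_labels = train_texts, train_labels

nlp = spacy.blank("en")
# "ensemble" matches the ensemble_* logs; "bow" would match the bow_* ones.
textcat = nlp.create_pipe("textcat", config={"architecture": "ensemble"})
nlp.add_pipe(textcat)
textcat.add_label("M")  # placeholder label name

train_data = list(zip(train_texts,
                      [{"cats": {"M": float(l)}} for l in train_labels]))

optimizer = nlp.begin_training()
for epoch in range(10):
    random.shuffle(train_data)
    losses = {}
    for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
    scores = [nlp(t).cats["M"] for t in dev_texts]
    print(f"epoch = {epoch}, losses = {losses}, roc = {roc_auc_score(dev_labels, scores)}")
```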
--------------------------------------------------------------------------------
/Notebooks/successful-models/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pitmonticone/RedditTextClassification/fdd8b3a6e649781df9147599889c4669517f65ab/Notebooks/successful-models/.DS_Store
--------------------------------------------------------------------------------
/Notebooks/successful-models/mlp-subreddits-5000.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Mining Challange: *Reddit Gender Text-Classification* (MLP) \n",
8 | "\n",
9 | "### Modules"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Populating the interactive namespace from numpy and matplotlib\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "# Numpy & matplotlib for notebooks \n",
27 | "%pylab inline\n",
28 | "\n",
29 | "# Pandas for data analysis and manipulation \n",
30 | "import pandas as pd \n",
31 | "\n",
32 | "# Sklearn \n",
33 | "from sklearn.preprocessing import StandardScaler # to standardize features by removing the mean and scaling to unit variance (z=(x-u)/s)\n",
34 | "from sklearn.neural_network import MLPClassifier # Multi-layer Perceptron classifier which optimizes the log-loss function using LBFGS or sdg.\n",
35 | "from sklearn.model_selection import train_test_split # to split arrays or matrices into random train and test subsets\n",
36 | "from sklearn.model_selection import KFold # K-Folds cross-validator providing train/test indices to split data in train/test sets.\n",
37 | "from sklearn.decomposition import PCA, TruncatedSVD # Principal component analysis (PCA); dimensionality reduction using truncated SVD.\n",
38 | "from sklearn.linear_model import LogisticRegression \n",
39 | "from sklearn.naive_bayes import MultinomialNB # Naive Bayes classifier for multinomial models\n",
40 | "from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts\n",
41 | "from sklearn.metrics import roc_auc_score as roc # Compute Area Under the Receiver Operating Characteristic Curve from prediction scores\n",
42 | "from sklearn.metrics import roc_curve, auc # Compute ROC; Compute Area Under the Curve (AUC) using the trapezoidal rule\n",
43 | "\n",
44 | "# Matplotlib\n",
45 | "import matplotlib # Data visualization\n",
46 | "import matplotlib.pyplot as plt \n",
47 | "import matplotlib.patches as mpatches \n",
48 | "\n",
49 | "# Seaborn\n",
50 | "import seaborn as sns # Statistical data visualization (based on matplotlib)"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "### Data Collection "
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "# Import the training dataset, test dataset and target\n",
67 | "\n",
68 | "# Import the training dataset\n",
69 | "train_data = pd.read_csv(\"../input/dataset/train_data.csv\", encoding=\"utf8\")\n",
70 | "\n",
71 | "# Import the test dataset\n",
72 | "test_data = pd.read_csv(\"../input/dataset/test_data.csv\", encoding=\"utf8\")\n",
73 | "\n",
74 | "# Import the target\n",
75 | "target = pd.read_csv(\"../input/dataset/train_target.csv\")\n",
76 | "\n",
77 | "# Create a dictionary of authors\n",
78 | "author_gender = {}\n",
79 | "for i in range(len(target)):\n",
80 | " author_gender[target.author[i]] = target.gender[i]"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### Data Manipulation "
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 3,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "# Create a list of aggregated binary subreddits \n",
97 | "Xs = []\n",
98 | "# Create a list of genders\n",
99 | "y = []\n",
100 | "# Create a list of authors\n",
101 | "a = []\n",
102 | "\n",
103 | "# Populate the lists \n",
104 | "for author, group in train_data.groupby(\"author\"):\n",
105 | " Xs.append(group.subreddit.str.cat(sep = \" \"))\n",
106 | " y.append(author_gender[author])\n",
107 | " a.append(author)\n",
108 | " \n",
109 | "# Lower text in comments \n",
110 | "clean_train_subreddits = [xs.lower() for xs in Xs]"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### Models Definition & Training\n",
118 | "\n",
119 | "#### CountVectorizer"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 4,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "# Define CountVectorizer \n",
129 | "vectorizer_ = CountVectorizer(analyzer = \"word\", \n",
130 | " tokenizer = None, \n",
131 | " preprocessor = None, \n",
132 | " stop_words = None,\n",
133 | " binary=True\n",
134 | " ) #500\n",
135 | "# Train CountVectorizer \n",
136 | "train_data_subreddits = vectorizer_.fit_transform(clean_train_subreddits).toarray()\n",
137 | "\n",
138 | "sum(train_data_subreddits[1])\n",
139 | "\n",
140 | "y = np.array(y)"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "#### MLP Classifier"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 5,
153 | "metadata": {
154 | "collapsed": true,
155 | "jupyter": {
156 | "outputs_hidden": true
157 | }
158 | },
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "Iteration 1, loss = 0.59613047\n",
165 | "Validation score: 0.734000\n",
166 | "Iteration 2, loss = 0.47953355\n",
167 | "Validation score: 0.814000\n",
168 | "Iteration 3, loss = 0.39179575\n",
169 | "Validation score: 0.864000\n",
170 | "Iteration 4, loss = 0.33398556\n",
171 | "Validation score: 0.870000\n",
172 | "Iteration 5, loss = 0.29788160\n",
173 | "Validation score: 0.860000\n",
174 | "Iteration 6, loss = 0.27413851\n",
175 | "Validation score: 0.858000\n",
176 | "Iteration 7, loss = 0.25758394\n",
177 | "Validation score: 0.856000\n",
178 | "Iteration 8, loss = 0.24291078\n",
179 | "Validation score: 0.858000\n",
180 | "Iteration 9, loss = 0.23275980\n",
181 | "Validation score: 0.864000\n",
182 | "Iteration 10, loss = 0.22383857\n",
183 | "Validation score: 0.860000\n",
184 | "Iteration 11, loss = 0.21650923\n",
185 | "Validation score: 0.860000\n",
186 | "Iteration 12, loss = 0.21024405\n",
187 | "Validation score: 0.850000\n",
188 | "Iteration 13, loss = 0.20492907\n",
189 | "Validation score: 0.850000\n",
190 | "Iteration 14, loss = 0.20017990\n",
191 | "Validation score: 0.848000\n",
192 | "Iteration 15, loss = 0.19573230\n",
193 | "Validation score: 0.850000\n",
194 | "Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.\n"
195 | ]
196 | },
197 | {
198 | "data": {
199 | "text/plain": [
200 | "MLPClassifier(activation='relu', alpha=0.05, batch_size='auto', beta_1=0.9,\n",
201 | " beta_2=0.999, early_stopping=True, epsilon=1e-08,\n",
202 | " hidden_layer_sizes=(100,), learning_rate='invscaling',\n",
203 | " learning_rate_init=0.001, max_fun=15000, max_iter=400,\n",
204 | " momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,\n",
205 | " power_t=0.5, random_state=0, shuffle=True, solver='adam',\n",
206 | " tol=0.0001, validation_fraction=0.1, verbose=True,\n",
207 | " warm_start=False)"
208 | ]
209 | },
210 | "execution_count": 5,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "# Define MLP Classifier:\n",
217 | "## Activation function for the hidden layer: \"rectified linear unit function\"\n",
218 | "## Solver for weight optimization: \"stochastic gradient-based optimizer\"\n",
219 | "## Alpha: regularization parameter\n",
220 | "## Learning rate schedule for weight updates: \"gradually decreases the learning rate at each time step t using an inverse scaling exponent of power_t\"\n",
221 | "## Verbose: \"True\" in order to print progress messages to stdout.\n",
222 | "## Early stopping: \"True\" in order to use early stopping to terminate training when validation score is not improving. It automatically sets aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.\n",
223 | "\n",
224 | "mlpClf = MLPClassifier(activation= 'relu', solver = 'adam', \n",
225 | " alpha = 0.05, learning_rate = 'invscaling', verbose = True, \n",
226 | " early_stopping = True, max_iter = 400, random_state=0)\n",
227 | "\n",
228 | " \n",
229 | "# K fold per la cross-validation\n",
230 | "kfold = KFold(n_splits = 10)\n",
231 | "\n",
232 | "# Training and validation on all K folds\n",
233 | "# for train_indices, test_indices in kf.split(train_data_subreddits):\n",
234 | "# mlpClf.fit(train_data_subreddits[train_indices], y[train_indices])\n",
235 | "# print(mlpClf.score(train_data_subreddits[test_indices], y[test_indices]))\n",
236 | " \n",
237 | "# cross_val_score resets parameters of my_model and fits it on X_train and t_train with cross validation (we did it for consistency).\n",
238 | "# results = cross_val_score(my_model, s, y, cv=kfold, scoring='roc_auc')\n",
239 | "# print(\"roc = \", np.mean(results))\n",
240 | " \n",
241 | "# Model fit\n",
242 | "mlpClf.fit(train_data_subreddits, y)"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "### Prediction "
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 6,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "Xs_test = []\n",
259 | "for author, group in test_data.groupby(\"author\"):\n",
260 | " Xs_test.append(group.subreddit.str.cat(sep = \" \"))\n",
261 | " \n",
262 | "clean_test_subreddits = [xs.lower() for xs in Xs_test]\n",
263 | "\n",
264 | "test_data_subreddits = vectorizer_.transform(clean_test_subreddits).toarray()\n",
265 | "\n",
266 | "y_score = mlpClf.predict_proba(test_data_subreddits)[:,1]\n",
267 | "\n",
268 | "np.save(\"y_testMLPs\",y_score)"
269 | ]
270 | }
271 | ],
272 | "metadata": {
273 | "kernelspec": {
274 | "display_name": "Python 3",
275 | "language": "python",
276 | "name": "python3"
277 | },
278 | "language_info": {
279 | "codemirror_mode": {
280 | "name": "ipython",
281 | "version": 3
282 | },
283 | "file_extension": ".py",
284 | "mimetype": "text/x-python",
285 | "name": "python",
286 | "nbconvert_exporter": "python",
287 | "pygments_lexer": "ipython3",
288 | "version": "3.7.4"
289 | }
290 | },
291 | "nbformat": 4,
292 | "nbformat_minor": 4
293 | }
294 |
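
The prediction cell above saves only the raw probabilities (`y_testMLPs.npy`). A short, hedged sketch of joining those scores back to authors: since pandas `groupby("author")` iterates keys in sorted order, sorting the unique test authors reproduces the order of `y_score`. The column names and output file name here are assumptions, not taken from the repository.

```python
import numpy as np
import pandas as pd

# groupby("author") iterates in sorted key order, so sorted unique authors
# line up one-to-one with the saved scores.
authors = sorted(test_data.author.unique())
y_score = np.load("y_testMLPs.npy")

# Column and file names are assumptions, for illustration only.
submission = pd.DataFrame({"author": authors, "gender": y_score})
submission.to_csv("submission.csv", index=False)
```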
--------------------------------------------------------------------------------
/Notebooks/successful-models/xgb-gridsearch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Mining Challange: *Reddit Gender Text-Classification* \n",
8 | "\n",
9 | "## Modules"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "%%time\n",
19 | "\n",
20 | "#Numpy\n",
21 | "import numpy as np\n",
22 | "\n",
23 | "#Sklearn\n",
24 | "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV # Exhaustive search over specified parameter values for a given estimator\n",
25 | "from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation\n",
26 | "from sklearn.model_selection import KFold # K-Folds cross-validator providing train/test indices to split data in train/test sets.\n",
27 | "from sklearn.model_selection import StratifiedKFold\n",
28 | "from sklearn.metrics import roc_auc_score # Compute Area Under the Receiver Operating Characteristic Curve from prediction scores\n",
29 | "from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts\n",
30 | "\n",
31 | "#XGBoost\n",
32 | "from xgboost import XGBRegressor\n",
33 | "\n",
34 | "# Matplotlib\n",
35 | "import matplotlib # Data visualization\n",
36 | "import matplotlib.pyplot as plt \n",
37 | "import matplotlib.patches as mpatches \n",
38 | "\n",
39 | "#Pickle\n",
40 | "import pickle # To load files\n",
41 | "\n",
42 | "# Joblib\n",
43 | "import joblib # To save models "
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Data Collection"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# load preprocessed data to save tine\n",
60 | "with open(\"../input/challengedadata/comments.txt\", \"rb\") as f:\n",
61 | " clean_train_comments = pickle.load(f) \n",
62 | " f.close()\n",
63 | "\n",
64 | "with open(\"../input/challengedadata/targets.txt\", \"rb\") as ft:\n",
65 | " y = pickle.load(ft) \n",
66 | " ft.close()"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Data Manipulation"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "vectorizer = CountVectorizer(analyzer = \"word\",\n",
83 | " max_features = 2000, ngram_range=(1, 2)) \n",
84 | "# converts in np array\n",
85 | "train_data_features = vectorizer.fit_transform(clean_train_comments).toarray()\n",
86 | "\n",
87 | "# create vocabulary\n",
88 | "vocab = vectorizer.get_feature_names()\n",
89 | "\n",
90 | "# counts how many times a word appears\n",
91 | "dist = np.sum(train_data_features, axis=0)\n",
92 | "\n",
93 | "# removes the 40 most utilized words\n",
94 | "for _ in range(40):\n",
95 | " index = np.argmax(dist)\n",
96 | " train_data_features = np.delete(train_data_features, index, axis = 1)\n",
97 | " \n",
98 | "X_len = [[len(x)] for x in train_data_features] \n",
99 | "s = np.concatenate((train_data_features,np.array(X_len)),axis = 1)\n",
100 | "\n",
101 | "# 5000 rows (one per author), and 2000-40+1 (X_len) features\n",
102 | "s.shape\n",
103 | "\n",
104 | "y = np.array(y) "
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Model Exploration"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "parameters = {\"learning_rate\":[0.03,0.05,0.07,0.01,0.15,0.2,0.25,0.3],'min_child_weight': [1,4,5,8],'gamma': [0.0, 0.1,0.2, 0.3,0.4,0.5,0.6,0.8],\n",
121 | " 'subsample': [0.6,0.7,0.8,0.9,1], 'colsample_bytree': [0.3,0.4,0.5, 0.6,0.7,0.8,0.9,1],\n",
122 | " 'max_depth': [2,3,4,5,6,7,8,10,12,15], 'scale_pos_weight': [1,2.70, 10, 25, 50, 75, 100, 1000] }\n",
123 | "\n",
124 | "parameters0 = {'min_child_weight': [1,8],'gamma': [0.6,0.8],\n",
125 | " 'subsample': [0.9], 'colsample_bytree': [0.6],\n",
126 | " 'max_depth': [4], 'scale_pos_weight': [1,2.70, 10, 25, 50, 75, 100, 1000] }\n",
127 | "\n",
128 | " \n",
129 | "xgb = XGBRegressor(objective = \"reg:logistic\", n_estimators=10000, \n",
130 | " tree_method = \"gpu_hist\", gpu_id = 0)\n",
131 | "\n",
132 | "\n",
133 | "# Model exploration\n",
134 | "xgbClf = GridSearchCV(xgb, param_grid = parameters0, cv = StratifiedKFold(n_splits=10, shuffle = True, random_state = 1001), scoring = \"roc_auc\" ,verbose=True, n_jobs=-1)\n",
135 | "\n",
136 | "# Model fit\n",
137 | "xgbClf.fit(s, y, verbose=False)\n",
138 | "\n",
139 | "# Save model\n",
140 | "joblib.dump(xgbClf, '../working/xgbClf.pkl')\n",
141 | "\n",
142 | "print(\"xgbCLf.best_score = \", xgbClf.best_score_)\n",
143 | "print(\"xgbCLf.best_estimator_ = \", xgbClf.best_estimator_)"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "```Fitting 10 folds for each of 32 candidates, totalling 320 fits\n",
151 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n",
152 | "[Parallel(n_jobs=-1)]: Done 46 tasks | elapsed: 24.1min\n",
153 | "[Parallel(n_jobs=-1)]: Done 196 tasks | elapsed: 105.6min\n",
154 | "[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 172.3min finished\n",
155 | "xgbCLf.best_score = 0.8425215483825477\n",
156 | "xgbCLf.best_estimator_ = XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,\n",
157 | " colsample_bynode=1, colsample_bytree=0.6, gamma=0.8, gpu_id=0,\n",
158 | " importance_type='gain', interaction_constraints=None,\n",
159 | " learning_rate=0.300000012, max_delta_step=0, max_depth=4,\n",
160 | " min_child_weight=1, missing=nan, monotone_constraints=None,\n",
161 | " n_estimators=10000, n_jobs=0, num_parallel_tree=1,\n",
162 | " objective='reg:logistic', random_state=0, reg_alpha=0,\n",
163 | " reg_lambda=1, scale_pos_weight=1, subsample=0.9,\n",
164 | " tree_method='gpu_hist', validate_parameters=False, verbosity=None)\n",
165 | "CPU times: user 1min, sys: 14.5 s, total: 1min 14s\n",
166 | "Wall time: 2h 53min 29s\n",
167 | "```"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "kernelspec": {
173 | "display_name": "Python 3",
174 | "language": "python",
175 | "name": "python3"
176 | },
177 | "language_info": {
178 | "codemirror_mode": {
179 | "name": "ipython",
180 | "version": 3
181 | },
182 | "file_extension": ".py",
183 | "mimetype": "text/x-python",
184 | "name": "python",
185 | "nbconvert_exporter": "python",
186 | "pygments_lexer": "ipython3",
187 | "version": "3.7.4"
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 4
192 | }
193 |
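
The notebook above persists the fitted search with `joblib.dump` but never reloads it. A minimal sketch of reusing the saved model follows; `s_test` is a placeholder, since a real test matrix must be built exactly like `s` (same `CountVectorizer`, same 40 removed columns, plus the length column).

```python
import joblib
import numpy as np

# Reload the fitted grid search saved by the notebook above.
xgbClf = joblib.load("../working/xgbClf.pkl")

# Placeholder: a real test matrix must have the same 2000 - 40 + 1 = 1961
# features as s, built with the same vectorizer and column removals.
s_test = np.zeros((1, 1961))

# GridSearchCV refits the best estimator on all data by default (refit=True),
# so predict() delegates to it; reg:logistic yields scores in [0, 1].
y_score = xgbClf.predict(s_test)
```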
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------