├── Basics └── Python Functions.ipynb ├── DataAnalysis ├── Add New Data to a Pretrained Model.ipynb ├── AutoML.ipynb ├── BaselineComparison.ipynb ├── BreakpointAnalysis │ ├── Breakpoints Analysis.ipynb │ └── Giallozafferano.csv ├── Data Analysis with pycaret.ipynb ├── Data Analysis.ipynb ├── KNN │ ├── KNN.ipynb │ └── data │ │ ├── gender_submission.csv │ │ ├── test.csv │ │ └── train.csv ├── Overfitting.ipynb ├── SARIMA.ipynb ├── SpeedUp Scikit.ipynb ├── Time Series Analysis.ipynb ├── TrainTest Split.ipynb ├── covid-19 │ ├── Comparison of the first and the second waves of COVID-19.ipynb │ ├── Italian_covid-19_analysis.ipynb │ └── Italian_covid-19_analysis_skewnorm.ipynb ├── source │ ├── diabetes.csv │ ├── heart.csv │ └── tourist_arrivals.csv └── tourism │ ├── AnalyseTourism 1.ipynb │ └── data │ └── eurostat.csv ├── DataCollection ├── PDF │ ├── .ipynb_checkpoints │ │ └── get_pdf-checkpoint.ipynb │ ├── get_pdf.ipynb │ └── source │ │ ├── .DS_Store │ │ └── Bolletino-sorveglianza-integrata-COVID-19_17-marzo-2020_appendix.pdf ├── Twitter │ ├── config.py │ └── get_tweets.ipynb └── Web │ ├── Nested Scraping.ipynb │ ├── Pandas read_html.ipynb │ └── source │ └── euro2020_groups.html ├── DataNarrative ├── Covid-19 Infections.ipynb ├── DataNarrative.ipynb └── source │ ├── car_sales.csv │ ├── conjunctivitis.csv │ ├── cough.csv │ ├── fever.csv │ ├── paracetamol.csv │ └── sore throat.csv ├── DataVisualization ├── .ipynb_checkpoints │ ├── Autoplotter-checkpoint.ipynb │ └── PlotlyVSMatplotlib-checkpoint.ipynb ├── Altair │ ├── Altair Decluttering.ipynb │ ├── AltairAnimation.py │ ├── Geographical maps.ipynb │ ├── Simple Bar Chart.ipynb │ └── data │ │ └── airports.csv ├── Autoplotter.ipynb ├── D3Graphs │ ├── badBarChart.html │ ├── barChart.html │ ├── choroplethMap.html │ ├── js │ │ ├── chart.js │ │ ├── chart2.js │ │ ├── line.js │ │ ├── line2.js │ │ └── map.js │ ├── line.html │ └── simpleLine.html ├── Django │ ├── Model Creation.ipynb │ └── mywebsite │ │ ├── manage.py │ │ ├── myapp │ │ 
├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── admin.cpython-38.pyc │ │ │ ├── apps.cpython-38.pyc │ │ │ └── models.cpython-38.pyc │ │ ├── admin.py │ │ ├── apps.py │ │ ├── migrations │ │ │ ├── 0001_initial.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ │ ├── 0001_initial.cpython-38.pyc │ │ │ │ └── __init__.cpython-38.pyc │ │ ├── models.py │ │ ├── tests.py │ │ └── views.py │ │ └── mywebsite │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── settings.cpython-38.pyc │ │ ├── urls.cpython-38.pyc │ │ └── wsgi.cpython-38.pyc │ │ ├── asgi.py │ │ ├── settings.py │ │ ├── urls.py │ │ └── wsgi.py ├── Folium │ └── GeoData Extraction.ipynb ├── GenerativeAI │ ├── build_chart.py │ ├── chart.html │ ├── red.png │ └── tsc00001.csv ├── Gradio │ └── Gradio Example.ipynb ├── Plotly │ ├── .DS_Store │ ├── .ipynb_checkpoints │ │ └── Plot.ly-checkpoint.ipynb │ ├── Plot.ly.ipynb │ ├── js │ │ ├── chart-part1.js │ │ └── chart-part2.js │ └── positivi.html ├── PlotlyVSMatplotlib.ipynb ├── ipyvizzu │ └── ipyvizzu Example.ipynb ├── presenza_negli_alberghi.csv └── sources │ ├── eu_live_births.xlsx │ ├── eu_regions.csv │ └── tourist_arrivals.csv ├── Datasets ├── Animals.sql ├── Animals_nested.sql ├── Shakespeare.txt ├── capitals1.csv ├── capitals2.csv ├── monthly_temperature.csv └── rainfall.xlsx ├── EnvironmentSetup └── Docker │ ├── Dockerfile │ ├── app.py │ ├── requirements.txt │ └── source │ └── heart.csv ├── Preprocessing ├── Balancing │ ├── Balancing.ipynb │ ├── Multiclass Balancing.ipynb │ └── glass.csv ├── Binning │ ├── .ipynb_checkpoints │ │ └── Data Preprocessing - Binning-checkpoint.ipynb │ ├── Data Preprocessing - Binning.ipynb │ └── cupcake.csv ├── DataCleaning │ └── Data Cleaning.ipynb ├── DataSampling │ └── Data Sampling.ipynb ├── Formatting │ ├── Data Preprocessing - Formatting.ipynb │ └── tweets.csv ├── GeoPandas │ └── GeoPandas.ipynb ├── HugeDatasets │ ├── Load Huge Datasets in Python Pandas.ipynb │ └── hepatitis.csv 
├── MissingValues │ ├── Data Preprocessing - Missing Values.ipynb │ ├── Data Preprocessing with scikit-learn - Part 1 Missing Values.ipynb │ ├── cupcake.csv │ └── hepatitis.csv ├── Normalization │ ├── .ipynb_checkpoints │ │ └── Data Preprocessing - Normalization-checkpoint.ipynb │ ├── Data Preprocessing - Normalization with scikit-learn.ipynb │ └── Data Preprocessing - Normalization.ipynb ├── OutliersDetection │ └── Outliers Detection.ipynb ├── RemoveDuplicates │ ├── Remove Duplicates.ipynb │ └── cupcake_duplicates.csv ├── SQLDF │ ├── .ipynb_checkpoints │ │ └── SQL DF-checkpoint.ipynb │ └── SQL DF.ipynb ├── SpeedUp │ ├── Numba.ipynb │ └── PySpark.ipynb └── Standardization │ ├── .ipynb_checkpoints │ └── Data Preprocessing - Standardization-checkpoint.ipynb │ └── Data Preprocessing - Standardization.ipynb ├── README.md ├── Tests ├── Calculator │ ├── calculator.py │ └── test_calculator.py └── CalculatorFixture │ ├── calculator.py │ └── test_calculator.py ├── TextAnalysis ├── .DS_Store ├── .ipynb_checkpoints │ ├── Anonymise Places-checkpoint.ipynb │ └── Spark NLP-checkpoint.ipynb ├── Anonymise Places.ipynb ├── Render Original Layout of Text Document after Manipulation.ipynb ├── Spark NLP.ipynb ├── Structured Information.ipynb ├── register.txt ├── source │ ├── .DS_Store │ ├── IT.txt │ ├── Tweets.csv │ └── WilliamShakespeare.docx └── string-similarity │ ├── jaro_similarity.py │ ├── jaro_soundex_similarity.py │ ├── leven_similarity.py │ └── surnames.csv └── TimeSeriesAnalysis └── Types of Graphs.ipynb /Basics/Python Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Three Tricks on Python Functions that You Should Know\n", 8 | "\n", 9 | "This tutorial covers the following three advanced programming tricks on Python functions:\n", 10 | "* nested functions\n", 11 | "* variable parameters\n", 12 | "* lambda functions\n", 13 | "\n", 
14 | "## Nested Function\n", 15 | "A nested function is a function within another function. Due to scope rules, usually a nested function cannot be invoked outside the container function.\n", 16 | "\n", 17 | "Nested functions can be used when a repeated operation should be run inside and only inside a function. The following example defines a function, which receives as input two strings, manipulates them and returns them." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "def manipulate_strings(a,b):\n", 27 | " \n", 28 | " def inner(s):\n", 29 | " s = s.lower()\n", 30 | " return s[::-1]\n", 31 | " \n", 32 | " return inner(a), inner(b)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Test the function on two strings:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "('olleh', 'dlrow')" 51 | ] 52 | }, 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "a = \"HELLO\"\n", 60 | "b = \"WORLD\"\n", 61 | "manipulate_strings(a,b)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "A nested function can be also returned by the outer function. \n", 69 | "\n", 70 | "Consider the following trivial function, which receives as input a number and returns as a function, which converts a string to lowercase and, then, if the string length is greater than n, it truncates the string to n-1. The function returns the inner function." 
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 81, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def manipulate_string(n):\n", 80 | " \n", 81 | " def inner(a):\n", 82 | " a = a.lower()\n", 83 | " if n < len(a):\n", 84 | " a = a[:n]\n", 85 | " return a\n", 86 | " \n", 87 | " return inner" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "Now, I can invoke the function and assign the returning value to a variable, which will contain the inner function." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 82, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "manipulate = manipulate_string(3)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "And then, I can invoke the inner function with different strings:" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 85, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "'hel'" 122 | ] 123 | }, 124 | "execution_count": 85, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "a = \"HELLO\"\n", 131 | "manipulate(a)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "The previous example demonstrates how nested functions can be used when you have some general parameters, which can be initialisated by the outer function, and then specific parameters can be used within the inner function." 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Variable Parameters\n", 146 | "Usually a function is invoked with a fixed number of parameters, including default ones. However, Python provides a mechanism, which permits to invoke a function with a potential unlimited number of parameters. 
\n", 147 | "\n", 148 | "There are two types of variable parameters:\n", 149 | "* tuples (list of item) - passed as * parameter, e.g. `*args`\n", 150 | "* dictionaries (key-value pairs) - passed as ** parameter, e.g. `**kargs`.\n", 151 | "\n", 152 | "The following example shows how to concatenate a variable number of strings by exploiting `*args`:" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 30, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def concatenate(*args):\n", 162 | " output = ''\n", 163 | " for item in args:\n", 164 | " output = output + item\n", 165 | " return output" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "And now I test the function with a variable number of parameters:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 31, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "'dogcat'" 184 | ] 185 | }, 186 | "execution_count": 31, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "concatenate('dog', 'cat')" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 32, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "'greenredyellow'" 204 | ] 205 | }, 206 | "execution_count": 32, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "concatenate('green', 'red', 'yellow')" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "The following example shows how to exploit the `**kargs` parameter. I define a class `Configuration`, which contains three parameters: alpha, beta and gamma. The class provides method, called `configure()` which can receive as input a variable number of parameters, which correspond to the configuration parameters of the class. 
The user can decide whether to set all the configuration parameters or only a subset of them." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 71, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "class Configuration:\n", 229 | " \n", 230 | " def __init__(self):\n", 231 | " self.p = {}\n", 232 | " self.p['alpha'] = None\n", 233 | " self.p['beta'] = None\n", 234 | " self.p['gamma'] = None\n", 235 | " \n", 236 | " def configure(self,**kargs):\n", 237 | " for k,v in kargs.items():\n", 238 | " self.p[k] = v\n", 239 | " \n", 240 | " def print_configuration(self):\n", 241 | " for k,v in self.p.items():\n", 242 | " print(k + ': ' + str(v))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "The class `Configuration` also provides a method, called `print_configuration()`, which prints the current status of the instance.\n", 250 | "\n", 251 | "I can create a `Configuration()` object and then I can decide, for example, to set only the alpha parameter:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 72, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "config = Configuration()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 73, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "config.configure(alpha = 2)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "I print the current configuration to make sure that the alpha parameter has been set:" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 74, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "alpha: 2\n", 289 | "beta: None\n", 290 | "gamma: None\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "config.print_configuration()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 
300 | "metadata": {}, 301 | "source": [ 302 | "Now I can set the alpha and the beta parameters:" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 75, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "alpha: 2\n", 315 | "beta: 4\n", 316 | "gamma: None\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "config.configure(alpha = 2, beta = 4)\n", 322 | "config.print_configuration()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "# Lambda Function\n", 330 | "A lambda function is an inline function, which can be used to run simple and repetitive operations, such well-known math operations.\n", 331 | "\n", 332 | "The following snippet of code shows how to calculate the Pythagorean theorem through a lambda function:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 76, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "from math import sqrt\n", 342 | "pythagora = lambda x,y : sqrt(x**2 + y**2)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 77, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "5.0" 354 | ] 355 | }, 356 | "execution_count": 77, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "pythagora(3,4)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 78, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "7.615773105863909" 374 | ] 375 | }, 376 | "execution_count": 78, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "pythagora(3,7)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [] 391 | } 392 | ], 393 | "metadata": { 394 | 
"kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.8.1" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 4 414 | } 415 | -------------------------------------------------------------------------------- /DataAnalysis/Add New Data to a Pretrained Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "47b91fc0", 6 | "metadata": {}, 7 | "source": [ 8 | "# Add new data to a pretrained model\n", 9 | "\n", 10 | "## Import data" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "c81c245c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from sklearn import datasets\n", 21 | "\n", 22 | "iris = datasets.load_iris()\n", 23 | "X = iris.data\n", 24 | "y = iris.target" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "bcab8984", 30 | "metadata": {}, 31 | "source": [ 32 | "## split data in three parts\n", 33 | "\n", 34 | "* X_train, y_train - training set 80% of 40% of data\n", 35 | "* X_test, y_test - test set 20% of 40 of data\n", 36 | "* X2 - y2 - new samples (60% of data)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 41, 42 | "id": "e99f494c", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.model_selection import train_test_split\n", 47 | "\n", 48 | "X1, X2, y1, y2 = train_test_split(X, y, test_size=0.60, random_state=42)\n", 49 | "X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.20, random_state=42)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 
69, 55 | "id": "1aaf0729", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "(48, 12, 90)" 62 | ] 63 | }, 64 | "execution_count": 69, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "len(X_train), len(X_test), len(X2)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "3b59234d", 76 | "metadata": {}, 77 | "source": [ 78 | "## First strategy - warm_start\n", 79 | "\n", 80 | "First training - warm_start = False. When I add new samples, training is done from scratch" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 89, 86 | "id": "0768e41f", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "RandomForestClassifier(max_depth=2, n_estimators=1, random_state=0)" 93 | ] 94 | }, 95 | "execution_count": 89, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "from sklearn.ensemble import RandomForestClassifier\n", 102 | "\n", 103 | "model = RandomForestClassifier(max_depth=2, random_state=0, warm_start=False, n_estimators=1)\n", 104 | "model.fit(X_train, y_train)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 90, 110 | "id": "798958d1", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "0.75" 117 | ] 118 | }, 119 | "execution_count": 90, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "model.score(X_test, y_test)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "def6fcc5", 131 | "metadata": {}, 132 | "source": [ 133 | "Fit the model on new data" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 91, 139 | "id": "6613f658", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "RandomForestClassifier(max_depth=2, n_estimators=1, random_state=0)" 146 | ] 147 | }, 148 | 
"execution_count": 91, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "model.fit(X2, y2)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 92, 160 | "id": "77ef4fcb", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "0.8333333333333334" 167 | ] 168 | }, 169 | "execution_count": 92, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "model.score(X_test, y_test)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "26808de2", 181 | "metadata": {}, 182 | "source": [ 183 | "warm_start = True. When I add new samples, training is incremental" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 75, 189 | "id": "127af6b5", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "0.75" 196 | ] 197 | }, 198 | "execution_count": 75, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "model = RandomForestClassifier(max_depth=2, random_state=0, warm_start=True, n_estimators=1)\n", 205 | "model.fit(X_train, y_train)\n", 206 | "model.score(X_test, y_test)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 76, 212 | "id": "02097780", 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "0.9166666666666666" 219 | ] 220 | }, 221 | "execution_count": 76, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "model.n_estimators+=1\n", 228 | "model.fit(X2, y2)\n", 229 | "model.score(X_test, y_test)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "1029433b", 235 | "metadata": {}, 236 | "source": [ 237 | "## Second strategy - partial fit" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 86, 243 | "id": "847e9b4b", 244 | 
"metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "SGDClassifier()" 250 | ] 251 | }, 252 | "execution_count": 86, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "from sklearn.linear_model import SGDClassifier\n", 259 | "import numpy as np\n", 260 | "\n", 261 | "model = SGDClassifier() \n", 262 | "model.partial_fit(X_train, y_train, classes=np.unique(y))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 87, 268 | "id": "fb303467", 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "0.4166666666666667" 275 | ] 276 | }, 277 | "execution_count": 87, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "model.score(X_test, y_test)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 88, 289 | "id": "38031504", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "0.8333333333333334" 296 | ] 297 | }, 298 | "execution_count": 88, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "model.partial_fit(X2, y2)\n", 305 | "model.score(X_test, y_test)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "f01bfa51", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [] 315 | } 316 | ], 317 | "metadata": { 318 | "kernelspec": { 319 | "display_name": "Python 3 (ipykernel)", 320 | "language": "python", 321 | "name": "python3" 322 | }, 323 | "language_info": { 324 | "codemirror_mode": { 325 | "name": "ipython", 326 | "version": 3 327 | }, 328 | "file_extension": ".py", 329 | "mimetype": "text/x-python", 330 | "name": "python", 331 | "nbconvert_exporter": "python", 332 | "pygments_lexer": "ipython3", 333 | "version": "3.8.10" 334 | } 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 5 338 | } 339 | 
-------------------------------------------------------------------------------- /DataAnalysis/BreakpointAnalysis/Giallozafferano.csv: -------------------------------------------------------------------------------- 1 | "date","audience" 2 | "2015-1-01","750823.0" 3 | "2015-2-01","823766.0" 4 | "2015-3-01","736273.0" 5 | "2015-4-01","663810.0" 6 | "2015-5-01","558158.0" 7 | "2015-6-01","523362.0" 8 | "2015-7-01","488415.0" 9 | "2015-8-01","520900.0" 10 | "2015-9-01","653422.0" 11 | "2015-10-01","755161.0" 12 | "2015-11-01","777377.0" 13 | "2015-12-01","780741.0" 14 | "2016-1-01","730955.0" 15 | "2016-2-01","761964.0" 16 | "2016-3-01","702277.0" 17 | "2016-4-01","554768.0" 18 | "2016-5-01","551632.0" 19 | "2016-6-01","484720.0" 20 | "2016-7-01","483712.0" 21 | "2016-8-01","441778.0" 22 | "2016-9-01","548845.0" 23 | "2016-10-01","583818.0" 24 | "2016-11-01","585894.0" 25 | "2016-12-01","593539.0" 26 | "2017-1-01","506504.0" 27 | "2017-2-01","550790.0" 28 | "2017-3-01","483189.0" 29 | "2017-4-01","465497.0" 30 | "2017-5-01","435161.0" 31 | "2017-6-01","375244.0" 32 | "2017-7-01","346830.0" 33 | "2017-8-01","359812.0" 34 | "2017-9-01","464405.0" 35 | "2017-10-01","528087.0" 36 | "2017-11-01","503457.0" 37 | "2017-12-01","632639.0" 38 | "2018-1-01","534149.0" 39 | "2018-2-01","653555.0" 40 | "2018-3-01","599916.0" 41 | "2018-4-01","890675.0" 42 | "2018-5-01","826407.0" 43 | "2018-6-01","1058788.0" 44 | "2018-7-01","1085114.0" 45 | "2018-8-01","1050619.0" 46 | "2018-9-01","1244511.0" 47 | "2018-10-01","1442592.0" 48 | "2018-11-01","1588954.0" 49 | "2018-12-01","1450385.0" 50 | "2019-1-01","1347488.0" 51 | "2019-2-01","1297969.0" 52 | "2019-3-01","1164651.0" 53 | "2019-4-01","1207549.0" 54 | "2019-5-01","1129548.0" 55 | "2019-6-01","1040899.0" 56 | "2019-7-01","1040065.0" 57 | "2019-8-01","1016274.0" 58 | "2019-9-01","1158630.0" 59 | "2019-10-01","1367487.0" 60 | "2019-11-01","1609979.0" 61 | "2019-12-01","1508977.0" 62 | "2020-1-01","1336777.0" 63 | 
"2020-2-01","1388018.0" 64 | "2020-3-01","2572820.0" 65 | "2020-4-01","3053974.0" 66 | "2020-5-01","1874335.0" 67 | "2020-6-01","1431240.0" 68 | "2020-7-01","1148040.0" 69 | "2020-8-01","1231572.0" 70 | "2020-9-01","1287384.0" 71 | "2020-10-01","1443484.0" 72 | "2020-11-01","1754085.0" 73 | -------------------------------------------------------------------------------- /DataAnalysis/KNN/data/gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 
1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 
| 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /DataAnalysis/source/heart.csv: -------------------------------------------------------------------------------- 1 | 
age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output 2 | 63,1,3,145,233,1,0,150,0,2.3,0,0,1,1 3 | 37,1,2,130,250,0,1,187,0,3.5,0,0,2,1 4 | 41,0,1,130,204,0,0,172,0,1.4,2,0,2,1 5 | 56,1,1,120,236,0,1,178,0,0.8,2,0,2,1 6 | 57,0,0,120,354,0,1,163,1,0.6,2,0,2,1 7 | 57,1,0,140,192,0,1,148,0,0.4,1,0,1,1 8 | 56,0,1,140,294,0,0,153,0,1.3,1,0,2,1 9 | 44,1,1,120,263,0,1,173,0,0,2,0,3,1 10 | 52,1,2,172,199,1,1,162,0,0.5,2,0,3,1 11 | 57,1,2,150,168,0,1,174,0,1.6,2,0,2,1 12 | 54,1,0,140,239,0,1,160,0,1.2,2,0,2,1 13 | 48,0,2,130,275,0,1,139,0,0.2,2,0,2,1 14 | 49,1,1,130,266,0,1,171,0,0.6,2,0,2,1 15 | 64,1,3,110,211,0,0,144,1,1.8,1,0,2,1 16 | 58,0,3,150,283,1,0,162,0,1,2,0,2,1 17 | 50,0,2,120,219,0,1,158,0,1.6,1,0,2,1 18 | 58,0,2,120,340,0,1,172,0,0,2,0,2,1 19 | 66,0,3,150,226,0,1,114,0,2.6,0,0,2,1 20 | 43,1,0,150,247,0,1,171,0,1.5,2,0,2,1 21 | 69,0,3,140,239,0,1,151,0,1.8,2,2,2,1 22 | 59,1,0,135,234,0,1,161,0,0.5,1,0,3,1 23 | 44,1,2,130,233,0,1,179,1,0.4,2,0,2,1 24 | 42,1,0,140,226,0,1,178,0,0,2,0,2,1 25 | 61,1,2,150,243,1,1,137,1,1,1,0,2,1 26 | 40,1,3,140,199,0,1,178,1,1.4,2,0,3,1 27 | 71,0,1,160,302,0,1,162,0,0.4,2,2,2,1 28 | 59,1,2,150,212,1,1,157,0,1.6,2,0,2,1 29 | 51,1,2,110,175,0,1,123,0,0.6,2,0,2,1 30 | 65,0,2,140,417,1,0,157,0,0.8,2,1,2,1 31 | 53,1,2,130,197,1,0,152,0,1.2,0,0,2,1 32 | 41,0,1,105,198,0,1,168,0,0,2,1,2,1 33 | 65,1,0,120,177,0,1,140,0,0.4,2,0,3,1 34 | 44,1,1,130,219,0,0,188,0,0,2,0,2,1 35 | 54,1,2,125,273,0,0,152,0,0.5,0,1,2,1 36 | 51,1,3,125,213,0,0,125,1,1.4,2,1,2,1 37 | 46,0,2,142,177,0,0,160,1,1.4,0,0,2,1 38 | 54,0,2,135,304,1,1,170,0,0,2,0,2,1 39 | 54,1,2,150,232,0,0,165,0,1.6,2,0,3,1 40 | 65,0,2,155,269,0,1,148,0,0.8,2,0,2,1 41 | 65,0,2,160,360,0,0,151,0,0.8,2,0,2,1 42 | 51,0,2,140,308,0,0,142,0,1.5,2,1,2,1 43 | 48,1,1,130,245,0,0,180,0,0.2,1,0,2,1 44 | 45,1,0,104,208,0,0,148,1,3,1,0,2,1 45 | 53,0,0,130,264,0,0,143,0,0.4,1,0,2,1 46 | 39,1,2,140,321,0,0,182,0,0,2,0,2,1 47 | 52,1,1,120,325,0,1,172,0,0.2,2,0,2,1 48 | 
44,1,2,140,235,0,0,180,0,0,2,0,2,1 49 | 47,1,2,138,257,0,0,156,0,0,2,0,2,1 50 | 53,0,2,128,216,0,0,115,0,0,2,0,0,1 51 | 53,0,0,138,234,0,0,160,0,0,2,0,2,1 52 | 51,0,2,130,256,0,0,149,0,0.5,2,0,2,1 53 | 66,1,0,120,302,0,0,151,0,0.4,1,0,2,1 54 | 62,1,2,130,231,0,1,146,0,1.8,1,3,3,1 55 | 44,0,2,108,141,0,1,175,0,0.6,1,0,2,1 56 | 63,0,2,135,252,0,0,172,0,0,2,0,2,1 57 | 52,1,1,134,201,0,1,158,0,0.8,2,1,2,1 58 | 48,1,0,122,222,0,0,186,0,0,2,0,2,1 59 | 45,1,0,115,260,0,0,185,0,0,2,0,2,1 60 | 34,1,3,118,182,0,0,174,0,0,2,0,2,1 61 | 57,0,0,128,303,0,0,159,0,0,2,1,2,1 62 | 71,0,2,110,265,1,0,130,0,0,2,1,2,1 63 | 54,1,1,108,309,0,1,156,0,0,2,0,3,1 64 | 52,1,3,118,186,0,0,190,0,0,1,0,1,1 65 | 41,1,1,135,203,0,1,132,0,0,1,0,1,1 66 | 58,1,2,140,211,1,0,165,0,0,2,0,2,1 67 | 35,0,0,138,183,0,1,182,0,1.4,2,0,2,1 68 | 51,1,2,100,222,0,1,143,1,1.2,1,0,2,1 69 | 45,0,1,130,234,0,0,175,0,0.6,1,0,2,1 70 | 44,1,1,120,220,0,1,170,0,0,2,0,2,1 71 | 62,0,0,124,209,0,1,163,0,0,2,0,2,1 72 | 54,1,2,120,258,0,0,147,0,0.4,1,0,3,1 73 | 51,1,2,94,227,0,1,154,1,0,2,1,3,1 74 | 29,1,1,130,204,0,0,202,0,0,2,0,2,1 75 | 51,1,0,140,261,0,0,186,1,0,2,0,2,1 76 | 43,0,2,122,213,0,1,165,0,0.2,1,0,2,1 77 | 55,0,1,135,250,0,0,161,0,1.4,1,0,2,1 78 | 51,1,2,125,245,1,0,166,0,2.4,1,0,2,1 79 | 59,1,1,140,221,0,1,164,1,0,2,0,2,1 80 | 52,1,1,128,205,1,1,184,0,0,2,0,2,1 81 | 58,1,2,105,240,0,0,154,1,0.6,1,0,3,1 82 | 41,1,2,112,250,0,1,179,0,0,2,0,2,1 83 | 45,1,1,128,308,0,0,170,0,0,2,0,2,1 84 | 60,0,2,102,318,0,1,160,0,0,2,1,2,1 85 | 52,1,3,152,298,1,1,178,0,1.2,1,0,3,1 86 | 42,0,0,102,265,0,0,122,0,0.6,1,0,2,1 87 | 67,0,2,115,564,0,0,160,0,1.6,1,0,3,1 88 | 68,1,2,118,277,0,1,151,0,1,2,1,3,1 89 | 46,1,1,101,197,1,1,156,0,0,2,0,3,1 90 | 54,0,2,110,214,0,1,158,0,1.6,1,0,2,1 91 | 58,0,0,100,248,0,0,122,0,1,1,0,2,1 92 | 48,1,2,124,255,1,1,175,0,0,2,2,2,1 93 | 57,1,0,132,207,0,1,168,1,0,2,0,3,1 94 | 52,1,2,138,223,0,1,169,0,0,2,4,2,1 95 | 54,0,1,132,288,1,0,159,1,0,2,1,2,1 96 | 45,0,1,112,160,0,1,138,0,0,1,0,2,1 97 | 
53,1,0,142,226,0,0,111,1,0,2,0,3,1 98 | 62,0,0,140,394,0,0,157,0,1.2,1,0,2,1 99 | 52,1,0,108,233,1,1,147,0,0.1,2,3,3,1 100 | 43,1,2,130,315,0,1,162,0,1.9,2,1,2,1 101 | 53,1,2,130,246,1,0,173,0,0,2,3,2,1 102 | 42,1,3,148,244,0,0,178,0,0.8,2,2,2,1 103 | 59,1,3,178,270,0,0,145,0,4.2,0,0,3,1 104 | 63,0,1,140,195,0,1,179,0,0,2,2,2,1 105 | 42,1,2,120,240,1,1,194,0,0.8,0,0,3,1 106 | 50,1,2,129,196,0,1,163,0,0,2,0,2,1 107 | 68,0,2,120,211,0,0,115,0,1.5,1,0,2,1 108 | 69,1,3,160,234,1,0,131,0,0.1,1,1,2,1 109 | 45,0,0,138,236,0,0,152,1,0.2,1,0,2,1 110 | 50,0,1,120,244,0,1,162,0,1.1,2,0,2,1 111 | 50,0,0,110,254,0,0,159,0,0,2,0,2,1 112 | 64,0,0,180,325,0,1,154,1,0,2,0,2,1 113 | 57,1,2,150,126,1,1,173,0,0.2,2,1,3,1 114 | 64,0,2,140,313,0,1,133,0,0.2,2,0,3,1 115 | 43,1,0,110,211,0,1,161,0,0,2,0,3,1 116 | 55,1,1,130,262,0,1,155,0,0,2,0,2,1 117 | 37,0,2,120,215,0,1,170,0,0,2,0,2,1 118 | 41,1,2,130,214,0,0,168,0,2,1,0,2,1 119 | 56,1,3,120,193,0,0,162,0,1.9,1,0,3,1 120 | 46,0,1,105,204,0,1,172,0,0,2,0,2,1 121 | 46,0,0,138,243,0,0,152,1,0,1,0,2,1 122 | 64,0,0,130,303,0,1,122,0,2,1,2,2,1 123 | 59,1,0,138,271,0,0,182,0,0,2,0,2,1 124 | 41,0,2,112,268,0,0,172,1,0,2,0,2,1 125 | 54,0,2,108,267,0,0,167,0,0,2,0,2,1 126 | 39,0,2,94,199,0,1,179,0,0,2,0,2,1 127 | 34,0,1,118,210,0,1,192,0,0.7,2,0,2,1 128 | 47,1,0,112,204,0,1,143,0,0.1,2,0,2,1 129 | 67,0,2,152,277,0,1,172,0,0,2,1,2,1 130 | 52,0,2,136,196,0,0,169,0,0.1,1,0,2,1 131 | 74,0,1,120,269,0,0,121,1,0.2,2,1,2,1 132 | 54,0,2,160,201,0,1,163,0,0,2,1,2,1 133 | 49,0,1,134,271,0,1,162,0,0,1,0,2,1 134 | 42,1,1,120,295,0,1,162,0,0,2,0,2,1 135 | 41,1,1,110,235,0,1,153,0,0,2,0,2,1 136 | 41,0,1,126,306,0,1,163,0,0,2,0,2,1 137 | 49,0,0,130,269,0,1,163,0,0,2,0,2,1 138 | 60,0,2,120,178,1,1,96,0,0,2,0,2,1 139 | 62,1,1,128,208,1,0,140,0,0,2,0,2,1 140 | 57,1,0,110,201,0,1,126,1,1.5,1,0,1,1 141 | 64,1,0,128,263,0,1,105,1,0.2,1,1,3,1 142 | 51,0,2,120,295,0,0,157,0,0.6,2,0,2,1 143 | 43,1,0,115,303,0,1,181,0,1.2,1,0,2,1 144 | 42,0,2,120,209,0,1,173,0,0,1,0,2,1 
145 | 67,0,0,106,223,0,1,142,0,0.3,2,2,2,1 146 | 76,0,2,140,197,0,2,116,0,1.1,1,0,2,1 147 | 70,1,1,156,245,0,0,143,0,0,2,0,2,1 148 | 44,0,2,118,242,0,1,149,0,0.3,1,1,2,1 149 | 60,0,3,150,240,0,1,171,0,0.9,2,0,2,1 150 | 44,1,2,120,226,0,1,169,0,0,2,0,2,1 151 | 42,1,2,130,180,0,1,150,0,0,2,0,2,1 152 | 66,1,0,160,228,0,0,138,0,2.3,2,0,1,1 153 | 71,0,0,112,149,0,1,125,0,1.6,1,0,2,1 154 | 64,1,3,170,227,0,0,155,0,0.6,1,0,3,1 155 | 66,0,2,146,278,0,0,152,0,0,1,1,2,1 156 | 39,0,2,138,220,0,1,152,0,0,1,0,2,1 157 | 58,0,0,130,197,0,1,131,0,0.6,1,0,2,1 158 | 47,1,2,130,253,0,1,179,0,0,2,0,2,1 159 | 35,1,1,122,192,0,1,174,0,0,2,0,2,1 160 | 58,1,1,125,220,0,1,144,0,0.4,1,4,3,1 161 | 56,1,1,130,221,0,0,163,0,0,2,0,3,1 162 | 56,1,1,120,240,0,1,169,0,0,0,0,2,1 163 | 55,0,1,132,342,0,1,166,0,1.2,2,0,2,1 164 | 41,1,1,120,157,0,1,182,0,0,2,0,2,1 165 | 38,1,2,138,175,0,1,173,0,0,2,4,2,1 166 | 38,1,2,138,175,0,1,173,0,0,2,4,2,1 167 | 67,1,0,160,286,0,0,108,1,1.5,1,3,2,0 168 | 67,1,0,120,229,0,0,129,1,2.6,1,2,3,0 169 | 62,0,0,140,268,0,0,160,0,3.6,0,2,2,0 170 | 63,1,0,130,254,0,0,147,0,1.4,1,1,3,0 171 | 53,1,0,140,203,1,0,155,1,3.1,0,0,3,0 172 | 56,1,2,130,256,1,0,142,1,0.6,1,1,1,0 173 | 48,1,1,110,229,0,1,168,0,1,0,0,3,0 174 | 58,1,1,120,284,0,0,160,0,1.8,1,0,2,0 175 | 58,1,2,132,224,0,0,173,0,3.2,2,2,3,0 176 | 60,1,0,130,206,0,0,132,1,2.4,1,2,3,0 177 | 40,1,0,110,167,0,0,114,1,2,1,0,3,0 178 | 60,1,0,117,230,1,1,160,1,1.4,2,2,3,0 179 | 64,1,2,140,335,0,1,158,0,0,2,0,2,0 180 | 43,1,0,120,177,0,0,120,1,2.5,1,0,3,0 181 | 57,1,0,150,276,0,0,112,1,0.6,1,1,1,0 182 | 55,1,0,132,353,0,1,132,1,1.2,1,1,3,0 183 | 65,0,0,150,225,0,0,114,0,1,1,3,3,0 184 | 61,0,0,130,330,0,0,169,0,0,2,0,2,0 185 | 58,1,2,112,230,0,0,165,0,2.5,1,1,3,0 186 | 50,1,0,150,243,0,0,128,0,2.6,1,0,3,0 187 | 44,1,0,112,290,0,0,153,0,0,2,1,2,0 188 | 60,1,0,130,253,0,1,144,1,1.4,2,1,3,0 189 | 54,1,0,124,266,0,0,109,1,2.2,1,1,3,0 190 | 50,1,2,140,233,0,1,163,0,0.6,1,1,3,0 191 | 41,1,0,110,172,0,0,158,0,0,2,0,3,0 192 | 
51,0,0,130,305,0,1,142,1,1.2,1,0,3,0 193 | 58,1,0,128,216,0,0,131,1,2.2,1,3,3,0 194 | 54,1,0,120,188,0,1,113,0,1.4,1,1,3,0 195 | 60,1,0,145,282,0,0,142,1,2.8,1,2,3,0 196 | 60,1,2,140,185,0,0,155,0,3,1,0,2,0 197 | 59,1,0,170,326,0,0,140,1,3.4,0,0,3,0 198 | 46,1,2,150,231,0,1,147,0,3.6,1,0,2,0 199 | 67,1,0,125,254,1,1,163,0,0.2,1,2,3,0 200 | 62,1,0,120,267,0,1,99,1,1.8,1,2,3,0 201 | 65,1,0,110,248,0,0,158,0,0.6,2,2,1,0 202 | 44,1,0,110,197,0,0,177,0,0,2,1,2,0 203 | 60,1,0,125,258,0,0,141,1,2.8,1,1,3,0 204 | 58,1,0,150,270,0,0,111,1,0.8,2,0,3,0 205 | 68,1,2,180,274,1,0,150,1,1.6,1,0,3,0 206 | 62,0,0,160,164,0,0,145,0,6.2,0,3,3,0 207 | 52,1,0,128,255,0,1,161,1,0,2,1,3,0 208 | 59,1,0,110,239,0,0,142,1,1.2,1,1,3,0 209 | 60,0,0,150,258,0,0,157,0,2.6,1,2,3,0 210 | 49,1,2,120,188,0,1,139,0,2,1,3,3,0 211 | 59,1,0,140,177,0,1,162,1,0,2,1,3,0 212 | 57,1,2,128,229,0,0,150,0,0.4,1,1,3,0 213 | 61,1,0,120,260,0,1,140,1,3.6,1,1,3,0 214 | 39,1,0,118,219,0,1,140,0,1.2,1,0,3,0 215 | 61,0,0,145,307,0,0,146,1,1,1,0,3,0 216 | 56,1,0,125,249,1,0,144,1,1.2,1,1,2,0 217 | 43,0,0,132,341,1,0,136,1,3,1,0,3,0 218 | 62,0,2,130,263,0,1,97,0,1.2,1,1,3,0 219 | 63,1,0,130,330,1,0,132,1,1.8,2,3,3,0 220 | 65,1,0,135,254,0,0,127,0,2.8,1,1,3,0 221 | 48,1,0,130,256,1,0,150,1,0,2,2,3,0 222 | 63,0,0,150,407,0,0,154,0,4,1,3,3,0 223 | 55,1,0,140,217,0,1,111,1,5.6,0,0,3,0 224 | 65,1,3,138,282,1,0,174,0,1.4,1,1,2,0 225 | 56,0,0,200,288,1,0,133,1,4,0,2,3,0 226 | 54,1,0,110,239,0,1,126,1,2.8,1,1,3,0 227 | 70,1,0,145,174,0,1,125,1,2.6,0,0,3,0 228 | 62,1,1,120,281,0,0,103,0,1.4,1,1,3,0 229 | 35,1,0,120,198,0,1,130,1,1.6,1,0,3,0 230 | 59,1,3,170,288,0,0,159,0,0.2,1,0,3,0 231 | 64,1,2,125,309,0,1,131,1,1.8,1,0,3,0 232 | 47,1,2,108,243,0,1,152,0,0,2,0,2,0 233 | 57,1,0,165,289,1,0,124,0,1,1,3,3,0 234 | 55,1,0,160,289,0,0,145,1,0.8,1,1,3,0 235 | 64,1,0,120,246,0,0,96,1,2.2,0,1,2,0 236 | 70,1,0,130,322,0,0,109,0,2.4,1,3,2,0 237 | 51,1,0,140,299,0,1,173,1,1.6,2,0,3,0 238 | 58,1,0,125,300,0,0,171,0,0,2,2,3,0 239 | 
60,1,0,140,293,0,0,170,0,1.2,1,2,3,0 240 | 77,1,0,125,304,0,0,162,1,0,2,3,2,0 241 | 35,1,0,126,282,0,0,156,1,0,2,0,3,0 242 | 70,1,2,160,269,0,1,112,1,2.9,1,1,3,0 243 | 59,0,0,174,249,0,1,143,1,0,1,0,2,0 244 | 64,1,0,145,212,0,0,132,0,2,1,2,1,0 245 | 57,1,0,152,274,0,1,88,1,1.2,1,1,3,0 246 | 56,1,0,132,184,0,0,105,1,2.1,1,1,1,0 247 | 48,1,0,124,274,0,0,166,0,0.5,1,0,3,0 248 | 56,0,0,134,409,0,0,150,1,1.9,1,2,3,0 249 | 66,1,1,160,246,0,1,120,1,0,1,3,1,0 250 | 54,1,1,192,283,0,0,195,0,0,2,1,3,0 251 | 69,1,2,140,254,0,0,146,0,2,1,3,3,0 252 | 51,1,0,140,298,0,1,122,1,4.2,1,3,3,0 253 | 43,1,0,132,247,1,0,143,1,0.1,1,4,3,0 254 | 62,0,0,138,294,1,1,106,0,1.9,1,3,2,0 255 | 67,1,0,100,299,0,0,125,1,0.9,1,2,2,0 256 | 59,1,3,160,273,0,0,125,0,0,2,0,2,0 257 | 45,1,0,142,309,0,0,147,1,0,1,3,3,0 258 | 58,1,0,128,259,0,0,130,1,3,1,2,3,0 259 | 50,1,0,144,200,0,0,126,1,0.9,1,0,3,0 260 | 62,0,0,150,244,0,1,154,1,1.4,1,0,2,0 261 | 38,1,3,120,231,0,1,182,1,3.8,1,0,3,0 262 | 66,0,0,178,228,1,1,165,1,1,1,2,3,0 263 | 52,1,0,112,230,0,1,160,0,0,2,1,2,0 264 | 53,1,0,123,282,0,1,95,1,2,1,2,3,0 265 | 63,0,0,108,269,0,1,169,1,1.8,1,2,2,0 266 | 54,1,0,110,206,0,0,108,1,0,1,1,2,0 267 | 66,1,0,112,212,0,0,132,1,0.1,2,1,2,0 268 | 55,0,0,180,327,0,2,117,1,3.4,1,0,2,0 269 | 49,1,2,118,149,0,0,126,0,0.8,2,3,2,0 270 | 54,1,0,122,286,0,0,116,1,3.2,1,2,2,0 271 | 56,1,0,130,283,1,0,103,1,1.6,0,0,3,0 272 | 46,1,0,120,249,0,0,144,0,0.8,2,0,3,0 273 | 61,1,3,134,234,0,1,145,0,2.6,1,2,2,0 274 | 67,1,0,120,237,0,1,71,0,1,1,0,2,0 275 | 58,1,0,100,234,0,1,156,0,0.1,2,1,3,0 276 | 47,1,0,110,275,0,0,118,1,1,1,1,2,0 277 | 52,1,0,125,212,0,1,168,0,1,2,2,3,0 278 | 58,1,0,146,218,0,1,105,0,2,1,1,3,0 279 | 57,1,1,124,261,0,1,141,0,0.3,2,0,3,0 280 | 58,0,1,136,319,1,0,152,0,0,2,2,2,0 281 | 61,1,0,138,166,0,0,125,1,3.6,1,1,2,0 282 | 42,1,0,136,315,0,1,125,1,1.8,1,0,1,0 283 | 52,1,0,128,204,1,1,156,1,1,1,0,0,0 284 | 59,1,2,126,218,1,1,134,0,2.2,1,1,1,0 285 | 40,1,0,152,223,0,1,181,0,0,2,0,3,0 286 | 
61,1,0,140,207,0,0,138,1,1.9,2,1,3,0 287 | 46,1,0,140,311,0,1,120,1,1.8,1,2,3,0 288 | 59,1,3,134,204,0,1,162,0,0.8,2,2,2,0 289 | 57,1,1,154,232,0,0,164,0,0,2,1,2,0 290 | 57,1,0,110,335,0,1,143,1,3,1,1,3,0 291 | 55,0,0,128,205,0,2,130,1,2,1,1,3,0 292 | 61,1,0,148,203,0,1,161,0,0,2,1,3,0 293 | 58,1,0,114,318,0,2,140,0,4.4,0,3,1,0 294 | 58,0,0,170,225,1,0,146,1,2.8,1,2,1,0 295 | 67,1,2,152,212,0,0,150,0,0.8,1,0,3,0 296 | 44,1,0,120,169,0,1,144,1,2.8,0,0,1,0 297 | 63,1,0,140,187,0,0,144,1,4,2,2,3,0 298 | 63,0,0,124,197,0,1,136,1,0,1,0,2,0 299 | 59,1,0,164,176,1,0,90,0,1,1,2,1,0 300 | 57,0,0,140,241,0,1,123,1,0.2,1,0,3,0 301 | 45,1,3,110,264,0,1,132,0,1.2,1,0,3,0 302 | 68,1,0,144,193,1,1,141,0,3.4,1,2,3,0 303 | 57,1,0,130,131,0,1,115,1,1.2,1,1,3,0 304 | 57,0,1,130,236,0,0,174,0,0,1,1,2,0 305 | -------------------------------------------------------------------------------- /DataAnalysis/source/tourist_arrivals.csv: -------------------------------------------------------------------------------- 1 | "date","value" 2 | "'2012-01-01'",2343290 3 | "'2012-02-01'",10468842 4 | "'2012-03-01'",13908950 5 | "'2012-04-01'",18456089 6 | "'2012-05-01'",20294254 7 | "'2012-06-01'",27101300 8 | "'2012-07-01'",32838284 9 | "'2012-08-01'",34392050 10 | "'2012-09-01'",23910073 11 | "'2012-10-01'",15828202 12 | "'2012-11-01'",10155960 13 | "'2012-12-01'",10804312 14 | "'2013-01-01'",9632532 15 | "'2013-02-01'",10628786 16 | "'2013-03-01'",14540671 17 | "'2013-04-01'",16192551 18 | "'2013-05-01'",21295358 19 | "'2013-06-01'",26751429 20 | "'2013-07-01'",32902913 21 | "'2013-08-01'",36156738 22 | "'2013-09-01'",23738366 23 | "'2013-10-01'",16062127 24 | "'2013-11-01'",10214150 25 | "'2013-12-01'",10832733 26 | "'2014-01-01'",9967129 27 | "'2014-02-01'",10511208 28 | "'2014-03-01'",13648915 29 | "'2014-04-01'",18901473 30 | "'2014-05-01'",21959612 31 | "'2014-06-01'",27492107 32 | "'2014-07-01'",32570827 33 | "'2014-08-01'",37625209 34 | "'2014-09-01'",24048425 35 | "'2014-10-01'",16899142 
36 | "'2014-11-01'",10128432 37 | "'2014-12-01'",11664198 38 | "'2015-01-01'",10612222 39 | "'2015-02-01'",11550777 40 | "'2015-03-01'",13831733 41 | "'2015-04-01'",18952191 42 | "'2015-05-01'",25627245 43 | "'2015-06-01'",27925041 44 | "'2015-07-01'",35733205 45 | "'2015-08-01'",39925492 46 | "'2015-09-01'",25894479 47 | "'2015-10-01'",18243614 48 | "'2015-11-01'",10581677 49 | "'2015-12-01'",12279139 50 | "'2016-01-01'",11134303 51 | "'2016-02-01'",12561880 52 | "'2016-03-01'",16392340 53 | "'2016-04-01'",18591118 54 | "'2016-05-01'",23982773 55 | "'2016-06-01'",28833492 56 | "'2016-07-01'",39233326 57 | "'2016-08-01'",39122111 58 | "'2016-09-01'",27405383 59 | "'2016-10-01'",19341501 60 | "'2016-11-01'",11013233 61 | "'2016-12-01'",12965045 62 | "'2017-01-01'",11838496 63 | "'2017-02-01'",12777744 64 | "'2017-03-01'",15141686 65 | "'2017-04-01'",23678625 66 | "'2017-05-01'",23314377 67 | "'2017-06-01'",33607949 68 | "'2017-07-01'",40571601 69 | "'2017-08-01'",40151914 70 | "'2017-09-01'",28567249 71 | "'2017-10-01'",19936344 72 | "'2017-11-01'",12022393 73 | "'2017-12-01'",14188122 74 | "'2018-01-01'",11658597 75 | "'2018-02-01'",13589839 76 | "'2018-03-01'",17758577 77 | "'2018-04-01'",22161704 78 | "'2018-05-01'",26998217 79 | "'2018-06-01'",34183766 80 | "'2018-07-01'",39230371 81 | "'2018-08-01'",41063487 82 | "'2018-09-01'",30390840 83 | "'2018-10-01'",20257400 84 | "'2018-11-01'",13149876 85 | "'2018-12-01'",14689527 86 | "'2019-01-01'",12024702 87 | "'2019-02-01'",13688422 88 | "'2019-03-01'",16673419 89 | "'2019-04-01'",23658680 90 | "'2019-05-01'",24832942 91 | "'2019-06-01'",34658825 92 | "'2019-07-01'",39123041 93 | "'2019-08-01'",41588218 94 | "'2019-09-01'",30253817 95 | -------------------------------------------------------------------------------- /DataCollection/PDF/.ipynb_checkpoints/get_pdf-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | 
"nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /DataCollection/PDF/source/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataCollection/PDF/source/.DS_Store -------------------------------------------------------------------------------- /DataCollection/PDF/source/Bolletino-sorveglianza-integrata-COVID-19_17-marzo-2020_appendix.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataCollection/PDF/source/Bolletino-sorveglianza-integrata-COVID-19_17-marzo-2020_appendix.pdf -------------------------------------------------------------------------------- /DataCollection/Twitter/config.py: -------------------------------------------------------------------------------- 1 | TWITTER_CONSUMER_KEY = 'PUT HERE YOUR CONSUMER KEY' 2 | TWITTER_CONSUMER_SECRET = 'PUT HERE YOUR CONSUMER SECRET' 3 | TWITTER_ACCESS_TOKEN = 'PUT HERE YOUR ACCESS TOKEN' 4 | TWITTER_ACCESS_TOKEN_SECRET = 'PUT HERE YOUR ACCESS TOKEN SECRET' 5 | -------------------------------------------------------------------------------- /DataCollection/Twitter/get_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Get Tweets\n", 8 | "\n", 9 | "This script extracts all the tweets with hashtag #covid-19 related to the day before today (yesterday) and saves them into a .csv file.\n", 10 | "We use the `tweepy` library, which can be installed with the command `pip install tweepy`.\n", 11 | "\n", 12 | "Firstly, we import the configuration file, called `config.py`, which is located in the same directory of this script." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from config import *\n", 22 | "import tweepy\n", 23 | "import datetime" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'vLcxeGYqPTm2De02u5Fa0c2Zs'" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "TWITTER_CONSUMER_KEY" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "We setup the connection to our Twitter App by using the `OAuthHandler()` class and its `access_token()` function. Then we call the Twitter API through the `API()` function." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "auth = tweepy.OAuthHandler(TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET)\n", 60 | "auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)\n", 61 | "#auth = tweepy.OAuthHandler(\"vLcxeGYqPTm2De02u5Fa0c2Zs\", \"htu890Ysc8RCdN5eMkh25P6qDehVCbpAh0zNIKH8M55AK5RHqo\")\n", 62 | "#auth.set_access_token(\"108051742-qVpGsVGEpCTf7B5xvbNs5jYHuHKaAwlpmqQQVjgq\", \"goRE81O2jfuvtCGWWjspF7Fsibk3VUi2qApbJ3nZCb4l5\")\n", 63 | "\n", 64 | "api = tweepy.API(auth,wait_on_rate_limit=True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Now we setup dates. We need to setup today and yesterday." 
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(datetime.date(2021, 9, 6), datetime.date(2021, 9, 5))" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "today = datetime.date.today()\n", 92 | "yesterday= today - datetime.timedelta(days=1)\n", 93 | "today, yesterday" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "We search for tweets on Twitter by using the `Cursor()` function. \n", 101 | "We pass the `api.search` parameter to the cursor, as well as the query string, which is specified through the `q` parameter of the cursor.\n", 102 | "The query string can receive many parameters, such as the following (not mandatory) ones:\n", 103 | "* `from:` - to specify a specific Twitter user profile\n", 104 | "* `since:` - to specify the beginning date of search\n", 105 | "* `until:` - to specify the ending date of search\n", 106 | "The cursor can also receive other parameters, such as the language and the `tweet_mode`. If `tweet_mode='extended'`, all the text of the tweet is returned, otherwise only the first 140 characters." 
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "tweets_list = tweepy.Cursor(api.search, q=\"#Covid-19 since:\" + str(yesterday)+ \" until:\" + str(today),tweet_mode='extended', lang='it').items()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "tweets_list = tweepy.Cursor(api.search, q=\"from: elonmusk\",tweet_mode='extended', lang='en').items()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Optionally, we can extract tweets from a given places, by specifying in the query string one of the following keywords, followed by `:`: \n", 132 | "* `place` - the place name or the place ID\n", 133 | "* `place_country` - the country code. See [here](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) to see the country code\n", 134 | "* `point_radius` - the circular geographic area within which to search for\n", 135 | "* `bounding_box` - the 4 sided geographic area, within which to search for.\n", 136 | "\n", 137 | "For more details, you can read the full [Twitter Documentation](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location).\n", 138 | "\n", 139 | "Here " 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 12, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "place = 'Rome'\n", 149 | "tweets_list = tweepy.Cursor(api.search, q=\"place: \" + place,tweet_mode='extended', lang='it').items()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Now we loop across the `tweets_list`, and, for each tweet, we extract the text, the creation date, the number of retweets and the favourite count. We store every tweet into a list, called `output`." 
157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 13, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "@TomFelton My Place ❤️ Fontana di Trevi, in Rome ❤️\n", 169 | "#ECFRSummerReadings\n", 170 | "🇪🇺🧐L' #UE attraversa una #crisi di #fiducia per la gestione della #pandemia: cosa dovrebbero fare i policymakers per ascoltare i cittadini europei?\n", 171 | "👉Il report di @sd270 @jana_puglierin @ECFRPower: https://t.co/WPA70SazC5\n", 172 | "In 🇮🇹: https://t.co/Zs3OaNopur https://t.co/WPX411i9C2\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "output = []\n", 178 | "for tweet in tweets_list:\n", 179 | " text = tweet._json[\"full_text\"]\n", 180 | " print(text)\n", 181 | " favourite_count = tweet.favorite_count\n", 182 | " retweet_count = tweet.retweet_count\n", 183 | " created_at = tweet.created_at\n", 184 | " \n", 185 | " line = {'text' : text, 'favourite_count' : favourite_count, 'retweet_count' : retweet_count, 'created_at' : created_at}\n", 186 | " output.append(line)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "output" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "Finally, we convert the `output` list to a `pandas DataFrame` and we store results." 
203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "output" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "import pandas as pd\n", 221 | "\n", 222 | "df = pd.DataFrame(output)\n", 223 | "df.to_csv('output.csv', mode='a', header=False)\n", 224 | "#df.to_csv('output.csv')" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "df.shape" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "df.head(10)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "Python 3", 256 | "language": "python", 257 | "name": "python3" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.8.1" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 4 274 | } 275 | -------------------------------------------------------------------------------- /DataCollection/Web/Nested Scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Scraping Data from Nested HTML Pages with Python Selenium \n", 8 | "In this tutorial, I illustrate how to scrape a list of terms, distributed over two levels of nested pages, through Python `selenium`. 
As example, I scrape the list of terms from [Bocardi](https://www.brocardi.it).\n", 9 | "\n", 10 | "## Recognize the Web Site Structure\n", 11 | "In order to scrape data from a Web site, firstly I need to study the URIs structure. \n", 12 | "In my example, the list of terms is organized alphabetically, and for each letter of the alphabet there is a dedicated page, available at `/dizionario//` (first level of URI). For example, for the letter `a`, the dedicated page is available at `https://www.brocardi.it/dizionario/a/`. \n", 13 | "In addition, the list of terms for each letter is paginated in different pages. For each letter, the first page is available the the first level of URI, while starting from the second page, the URI changes and is available at `/dizionario//?page=`. For example, for the letter `a`, the list of terms in the second page is available at the link `https://www.brocardi.it/dizionario/a/?page=2`.\n", 14 | "\n", 15 | "## Environment Setup\n", 16 | "In my code, I need to implement two loops: an external loop for letters and an internal loop for pages. I note that some letters are missing (`jkwxy`). For the external loop, I build a list containing all the letters, but the missing ones. I exploit `string.ascii_lowercase` to build the list of letters." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import string\n", 26 | "letters = string.ascii_lowercase\n", 27 | "letters = letters.replace('jk', '')\n", 28 | "letters = letters.replace('wxy', '')\n", 29 | "letters = list(letters)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Then I define two variables, `basic_url`, which contains the basic url to the Web site, and `table`, which will contain the list of all extracted terms. Initially `table` is an empty list." 
37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "table = []\n", 46 | "basic_url = \"https://www.brocardi.it\"" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Now I import all the `selenium` drivers and the `NoSuchElementException` exception, which will be used to catch some kind of exceptions, while performing the internal loop. I also import the `pandas` library." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from selenium import webdriver\n", 63 | "from selenium.webdriver.chrome.options import Options \n", 64 | "from selenium.common.exceptions import NoSuchElementException" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Nested Loops\n", 72 | "I implement the external loop through a `for` ranging from `a` to `z`. At each step of the external loop, I build the url. Then I implement the internal infinite loop through a `while`. Within the internal loop I build a driver, which performs scraping. I exploit a `Chrome()` webdriver, which receives as input the `--headless` and the `--lang=it` options. The first options specifies that the browser will not be opened, while the second option specifies the language of the browser.\n", 73 | "\n", 74 | "Once connected, I search for two elements:\n", 75 | "* the elements which contain the list of terms\n", 76 | "* the element which contains the link to the next page.\n", 77 | "\n", 78 | "Both elements depend on the structure of the HTML page. I exploit the function `find_elements_by_xpath()` to search for a specific XPath.\n", 79 | "\n", 80 | "As already said, the internal loop is an infinite loop, where the break condition is given by a `NoSuchElementException`, raised when there are no further next pages. 
The list of terms is stored in the `table` variable." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "for letter in letters:\n", 90 | " \n", 91 | " url = basic_url + '/dizionario/' + letter + '/'\n", 92 | " while True:\n", 93 | " try:\n", 94 | " print(url)\n", 95 | " options = Options() \n", 96 | " options.add_argument(\"--headless\") \n", 97 | " options.add_argument(\"--lang=it\");\n", 98 | " driver = webdriver.Chrome(options=options)\n", 99 | "\n", 100 | " driver.get(url)\n", 101 | "\n", 102 | " # get the list of terms\n", 103 | " xpath = '//ul[@class=\"terms-list\"]'\n", 104 | " words = driver.find_element_by_xpath(xpath).text\n", 105 | " table.extend(words.split('\\n'))\n", 106 | " \n", 107 | " # get the next page\n", 108 | " xpath = '//a[@class=\"next\"]'\n", 109 | " url = driver.find_element_by_xpath(xpath).get_attribute('href')\n", 110 | " \n", 111 | " driver.close()\n", 112 | " \n", 113 | " except NoSuchElementException:\n", 114 | " break\n", 115 | " \n", 116 | " " 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Store results\n", 124 | "\n", 125 | "The variable `table` contains the list of all terms. I can store it to a CSV file. This can be done by building a `pandas` Dataframe." 
126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import pandas as pd\n", 135 | "\n", 136 | "df = pd.DataFrame(table, columns=['word'])\n", 137 | "df.head()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "df['word'] = df['word'].str.lower()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "df.to_csv('outputs/glossary.csv')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.8.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 4 187 | } 188 | -------------------------------------------------------------------------------- /DataCollection/Web/source/euro2020_groups.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | AutoIbride 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 |
19 |
20 |

Euro 2020 - Groups

21 |
22 |
23 |
24 | 25 |
26 |

Group A

27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
P+/-Pts
1 ITA Italy379
2 WAL Wales314
3 SUI Switzerland3-14
4 TUR Turkey3-70
60 |
61 | 62 |
63 |

Group B

64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 |
P+/-Pts
1 BEL Belgium369
2 DEN Denmark313
3 FIN Finland3-23
4 RUS Russia3-53
101 |
102 |
103 |

Group C

104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 |
P+/-Pts
1 NED Netherlands369
2 AUT Austria316
3 UKR Ukraine3-13
4 MKD North Macedonia3-60
142 |
143 |
144 |
145 |
146 |

Group D

147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 |
P+/-Pts
1 ENG England327
3 CRO Croatia314
3 CZE Czech Republic314
4 SCO Scotland3-41
184 |
185 |
186 |

Group E

187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 |
P+/-Pts
1 SWE Sweden327
2 ESP Spain355
3 SVK Slovakia3-53
4 POL Poland3-21
225 |
226 |
227 |

Group F

228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 |
P+/-Pts
1 FRA France315
2 GER Germany314
3 POR Portugal314
4 HUN Hungary3-32
266 |
267 |
268 |
269 | 270 | 271 | -------------------------------------------------------------------------------- /DataNarrative/source/car_sales.csv: -------------------------------------------------------------------------------- 1 | "DATE","BEACON","DRAPER","FILMORE","LAKESIDE","MARE VALLEY","NORTH","OAKLEY","ORLY","PIERCE","ROSEDALE","SEALY","SOUTHLAKE","WESTLAKE","WILDLAND","REGIONAL AVG" 2 | "2017 Q1",69,130,85,117,16,74,61,51,35,65,93,72,83,83,74 3 | "2017 Q2",77,86,97,107,17,71,55,61,32,55,89,64,73,79,69 4 | "2017 Q3",63,90,92,127,18,68,55,67,32,59,79,58,73,81,69 5 | "2017 Q4",61,100,95,132,21,72,72,51,49,53,82,77,85,75,73 6 | "2018 Q1",67,106,92,145,21,69,79,51,49,59,94,87,87,79,78 7 | "2018 Q2",67,91,83,112,17,71,60,66,42,52,83,70,54,72,67 8 | "2018 Q3",57,89,103,107,17,56,42,52,23,41,84,66,59,51,61 9 | "2018 Q4",62,87,109,95,17,66,44,60,48,37,81,59,64,46,63 10 | "2019 Q1",68,93,94,126,21,83,65,66,38,63,89,66,59,66,71 11 | "2019 Q2",61,96,104,133,13,79,80,47,24,49,70,65,68,57,68 12 | "2019 Q3",54,103,88,105,22,66,37,63,26,52,66,55,51,69,61 13 | -------------------------------------------------------------------------------- /DataNarrative/source/conjunctivitis.csv: -------------------------------------------------------------------------------- 1 | Week,conjunctivitis 2 | 2016-09-11,20 3 | 2016-09-18,14 4 | 2016-09-25,21 5 | 2016-10-02,10 6 | 2016-10-09,14 7 | 2016-10-16,25 8 | 2016-10-23,26 9 | 2016-10-30,21 10 | 2016-11-06,17 11 | 2016-11-13,25 12 | 2016-11-20,22 13 | 2016-11-27,26 14 | 2016-12-04,24 15 | 2016-12-11,22 16 | 2016-12-18,23 17 | 2016-12-25,31 18 | 2017-01-01,31 19 | 2017-01-08,22 20 | 2017-01-15,29 21 | 2017-01-22,28 22 | 2017-01-29,24 23 | 2017-02-05,37 24 | 2017-02-12,25 25 | 2017-02-19,28 26 | 2017-02-26,29 27 | 2017-03-05,35 28 | 2017-03-12,29 29 | 2017-03-19,33 30 | 2017-03-26,39 31 | 2017-04-02,27 32 | 2017-04-09,22 33 | 2017-04-16,28 34 | 2017-04-23,26 35 | 2017-04-30,33 36 | 2017-05-07,26 37 | 2017-05-14,40 38 | 2017-05-21,28 39 | 2017-05-28,44 
40 | 2017-06-04,19 41 | 2017-06-11,27 42 | 2017-06-18,32 43 | 2017-06-25,25 44 | 2017-07-02,32 45 | 2017-07-09,37 46 | 2017-07-16,29 47 | 2017-07-23,22 48 | 2017-07-30,33 49 | 2017-08-06,28 50 | 2017-08-13,31 51 | 2017-08-20,36 52 | 2017-08-27,16 53 | 2017-09-03,21 54 | 2017-09-10,20 55 | 2017-09-17,18 56 | 2017-09-24,21 57 | 2017-10-01,28 58 | 2017-10-08,17 59 | 2017-10-15,19 60 | 2017-10-22,20 61 | 2017-10-29,20 62 | 2017-11-05,26 63 | 2017-11-12,32 64 | 2017-11-19,29 65 | 2017-11-26,31 66 | 2017-12-03,22 67 | 2017-12-10,24 68 | 2017-12-17,34 69 | 2017-12-24,35 70 | 2017-12-31,29 71 | 2018-01-07,23 72 | 2018-01-14,27 73 | 2018-01-21,30 74 | 2018-01-28,24 75 | 2018-02-04,25 76 | 2018-02-11,28 77 | 2018-02-18,35 78 | 2018-02-25,25 79 | 2018-03-04,27 80 | 2018-03-11,24 81 | 2018-03-18,26 82 | 2018-03-25,27 83 | 2018-04-01,28 84 | 2018-04-08,32 85 | 2018-04-15,38 86 | 2018-04-22,35 87 | 2018-04-29,28 88 | 2018-05-06,26 89 | 2018-05-13,24 90 | 2018-05-20,33 91 | 2018-05-27,34 92 | 2018-06-03,29 93 | 2018-06-10,30 94 | 2018-06-17,28 95 | 2018-06-24,27 96 | 2018-07-01,22 97 | 2018-07-08,22 98 | 2018-07-15,26 99 | 2018-07-22,25 100 | 2018-07-29,18 101 | 2018-08-05,24 102 | 2018-08-12,33 103 | 2018-08-19,35 104 | 2018-08-26,22 105 | 2018-09-02,20 106 | 2018-09-09,25 107 | 2018-09-16,19 108 | 2018-09-23,21 109 | 2018-09-30,21 110 | 2018-10-07,23 111 | 2018-10-14,24 112 | 2018-10-21,25 113 | 2018-10-28,31 114 | 2018-11-04,27 115 | 2018-11-11,31 116 | 2018-11-18,16 117 | 2018-11-25,32 118 | 2018-12-02,27 119 | 2018-12-09,29 120 | 2018-12-16,29 121 | 2018-12-23,33 122 | 2018-12-30,26 123 | 2019-01-06,22 124 | 2019-01-13,31 125 | 2019-01-20,38 126 | 2019-01-27,33 127 | 2019-02-03,30 128 | 2019-02-10,34 129 | 2019-02-17,22 130 | 2019-02-24,25 131 | 2019-03-03,22 132 | 2019-03-10,26 133 | 2019-03-17,29 134 | 2019-03-24,32 135 | 2019-03-31,26 136 | 2019-04-07,35 137 | 2019-04-14,33 138 | 2019-04-21,34 139 | 2019-04-28,31 140 | 2019-05-05,24 141 | 2019-05-12,27 142 | 2019-05-19,27 
143 | 2019-05-26,32 144 | 2019-06-02,29 145 | 2019-06-09,40 146 | 2019-06-16,32 147 | 2019-06-23,45 148 | 2019-06-30,44 149 | 2019-07-07,22 150 | 2019-07-14,28 151 | 2019-07-21,32 152 | 2019-07-28,27 153 | 2019-08-04,33 154 | 2019-08-11,30 155 | 2019-08-18,22 156 | 2019-08-25,29 157 | 2019-09-01,20 158 | 2019-09-08,20 159 | 2019-09-15,29 160 | 2019-09-22,19 161 | 2019-09-29,19 162 | 2019-10-06,14 163 | 2019-10-13,19 164 | 2019-10-20,22 165 | 2019-10-27,18 166 | 2019-11-03,26 167 | 2019-11-10,24 168 | 2019-11-17,20 169 | 2019-11-24,18 170 | 2019-12-01,31 171 | 2019-12-08,21 172 | 2019-12-15,22 173 | 2019-12-22,29 174 | 2019-12-29,20 175 | 2020-01-05,27 176 | 2020-01-12,27 177 | 2020-01-19,19 178 | 2020-01-26,35 179 | 2020-02-02,43 180 | 2020-02-09,34 181 | 2020-02-16,48 182 | 2020-02-23,70 183 | 2020-03-01,96 184 | 2020-03-08,100 185 | 2020-03-15,98 186 | 2020-03-22,74 187 | 2020-03-29,62 188 | 2020-04-05,60 189 | 2020-04-12,48 190 | 2020-04-19,49 191 | 2020-04-26,51 192 | 2020-05-03,41 193 | 2020-05-10,41 194 | 2020-05-17,29 195 | 2020-05-24,44 196 | 2020-05-31,20 197 | 2020-06-07,22 198 | 2020-06-14,23 199 | 2020-06-21,28 200 | 2020-06-28,23 201 | 2020-07-05,25 202 | 2020-07-12,20 203 | 2020-07-19,35 204 | 2020-07-26,26 205 | 2020-08-02,33 206 | 2020-08-09,27 207 | 2020-08-16,35 208 | 2020-08-23,32 209 | 2020-08-30,36 210 | 2020-09-06,28 211 | 2020-09-13,27 212 | 2020-09-20,26 213 | 2020-09-27,30 214 | 2020-10-04,26 215 | 2020-10-11,33 216 | 2020-10-18,37 217 | 2020-10-25,31 218 | 2020-11-01,32 219 | 2020-11-08,36 220 | 2020-11-15,26 221 | 2020-11-22,21 222 | 2020-11-29,26 223 | 2020-12-06,32 224 | 2020-12-13,21 225 | 2020-12-20,21 226 | 2020-12-27,29 227 | 2021-01-03,24 228 | 2021-01-10,26 229 | 2021-01-17,17 230 | 2021-01-24,33 231 | 2021-01-31,20 232 | 2021-02-07,31 233 | 2021-02-14,27 234 | 2021-02-21,24 235 | 2021-02-28,21 236 | 2021-03-07,26 237 | 2021-03-14,29 238 | 2021-03-21,20 239 | 2021-03-28,32 240 | 2021-04-04,25 241 | 2021-04-11,22 242 | 
2021-04-18,23 243 | 2021-04-25,14 244 | 2021-05-02,20 245 | 2021-05-09,25 246 | 2021-05-16,19 247 | 2021-05-23,26 248 | 2021-05-30,29 249 | 2021-06-06,31 250 | 2021-06-13,34 251 | 2021-06-20,23 252 | 2021-06-27,21 253 | 2021-07-04,35 254 | 2021-07-11,28 255 | 2021-07-18,22 256 | 2021-07-25,31 257 | 2021-08-01,23 258 | 2021-08-08,29 259 | 2021-08-15,23 260 | 2021-08-22,19 261 | 2021-08-29,25 262 | 2021-09-05,17 263 | -------------------------------------------------------------------------------- /DataNarrative/source/cough.csv: -------------------------------------------------------------------------------- 1 | Week,cough 2 | 2016-09-11,16 3 | 2016-09-18,16 4 | 2016-09-25,27 5 | 2016-10-02,36 6 | 2016-10-09,33 7 | 2016-10-16,40 8 | 2016-10-23,34 9 | 2016-10-30,35 10 | 2016-11-06,34 11 | 2016-11-13,36 12 | 2016-11-20,31 13 | 2016-11-27,38 14 | 2016-12-04,46 15 | 2016-12-11,47 16 | 2016-12-18,57 17 | 2016-12-25,78 18 | 2017-01-01,73 19 | 2017-01-08,51 20 | 2017-01-15,52 21 | 2017-01-22,50 22 | 2017-01-29,49 23 | 2017-02-05,42 24 | 2017-02-12,45 25 | 2017-02-19,47 26 | 2017-02-26,38 27 | 2017-03-05,39 28 | 2017-03-12,32 29 | 2017-03-19,36 30 | 2017-03-26,34 31 | 2017-04-02,29 32 | 2017-04-09,32 33 | 2017-04-16,30 34 | 2017-04-23,28 35 | 2017-04-30,27 36 | 2017-05-07,23 37 | 2017-05-14,27 38 | 2017-05-21,25 39 | 2017-05-28,23 40 | 2017-06-04,22 41 | 2017-06-11,21 42 | 2017-06-18,19 43 | 2017-06-25,17 44 | 2017-07-02,16 45 | 2017-07-09,17 46 | 2017-07-16,17 47 | 2017-07-23,15 48 | 2017-07-30,14 49 | 2017-08-06,17 50 | 2017-08-13,18 51 | 2017-08-20,17 52 | 2017-08-27,17 53 | 2017-09-03,17 54 | 2017-09-10,19 55 | 2017-09-17,24 56 | 2017-09-24,35 57 | 2017-10-01,42 58 | 2017-10-08,41 59 | 2017-10-15,45 60 | 2017-10-22,41 61 | 2017-10-29,33 62 | 2017-11-05,36 63 | 2017-11-12,30 64 | 2017-11-19,32 65 | 2017-11-26,38 66 | 2017-12-03,38 67 | 2017-12-10,39 68 | 2017-12-17,48 69 | 2017-12-24,76 70 | 2017-12-31,74 71 | 2018-01-07,67 72 | 2018-01-14,59 73 | 2018-01-21,55 74 | 
2018-01-28,53 75 | 2018-02-04,48 76 | 2018-02-11,43 77 | 2018-02-18,44 78 | 2018-02-25,49 79 | 2018-03-04,35 80 | 2018-03-11,38 81 | 2018-03-18,33 82 | 2018-03-25,39 83 | 2018-04-01,30 84 | 2018-04-08,32 85 | 2018-04-15,30 86 | 2018-04-22,31 87 | 2018-04-29,25 88 | 2018-05-06,21 89 | 2018-05-13,24 90 | 2018-05-20,26 91 | 2018-05-27,29 92 | 2018-06-03,25 93 | 2018-06-10,19 94 | 2018-06-17,20 95 | 2018-06-24,19 96 | 2018-07-01,20 97 | 2018-07-08,19 98 | 2018-07-15,16 99 | 2018-07-22,18 100 | 2018-07-29,18 101 | 2018-08-05,20 102 | 2018-08-12,20 103 | 2018-08-19,19 104 | 2018-08-26,21 105 | 2018-09-02,23 106 | 2018-09-09,25 107 | 2018-09-16,23 108 | 2018-09-23,26 109 | 2018-09-30,33 110 | 2018-10-07,41 111 | 2018-10-14,44 112 | 2018-10-21,36 113 | 2018-10-28,36 114 | 2018-11-04,37 115 | 2018-11-11,40 116 | 2018-11-18,35 117 | 2018-11-25,39 118 | 2018-12-02,43 119 | 2018-12-09,41 120 | 2018-12-16,45 121 | 2018-12-23,59 122 | 2018-12-30,64 123 | 2019-01-06,54 124 | 2019-01-13,54 125 | 2019-01-20,57 126 | 2019-01-27,58 127 | 2019-02-03,62 128 | 2019-02-10,51 129 | 2019-02-17,53 130 | 2019-02-24,43 131 | 2019-03-03,48 132 | 2019-03-10,41 133 | 2019-03-17,37 134 | 2019-03-24,39 135 | 2019-03-31,37 136 | 2019-04-07,36 137 | 2019-04-14,28 138 | 2019-04-21,34 139 | 2019-04-28,30 140 | 2019-05-05,25 141 | 2019-05-12,27 142 | 2019-05-19,25 143 | 2019-05-26,27 144 | 2019-06-02,29 145 | 2019-06-09,24 146 | 2019-06-16,23 147 | 2019-06-23,19 148 | 2019-06-30,16 149 | 2019-07-07,15 150 | 2019-07-14,19 151 | 2019-07-21,16 152 | 2019-07-28,17 153 | 2019-08-04,18 154 | 2019-08-11,15 155 | 2019-08-18,17 156 | 2019-08-25,21 157 | 2019-09-01,18 158 | 2019-09-08,19 159 | 2019-09-15,23 160 | 2019-09-22,26 161 | 2019-09-29,26 162 | 2019-10-06,35 163 | 2019-10-13,43 164 | 2019-10-20,34 165 | 2019-10-27,40 166 | 2019-11-03,36 167 | 2019-11-10,41 168 | 2019-11-17,40 169 | 2019-11-24,38 170 | 2019-12-01,39 171 | 2019-12-08,38 172 | 2019-12-15,45 173 | 2019-12-22,54 174 | 2019-12-29,56 175 | 
2020-01-05,52 176 | 2020-01-12,55 177 | 2020-01-19,62 178 | 2020-01-26,62 179 | 2020-02-02,59 180 | 2020-02-09,58 181 | 2020-02-16,60 182 | 2020-02-23,81 183 | 2020-03-01,79 184 | 2020-03-08,100 185 | 2020-03-15,79 186 | 2020-03-22,57 187 | 2020-03-29,38 188 | 2020-04-05,31 189 | 2020-04-12,28 190 | 2020-04-19,24 191 | 2020-04-26,22 192 | 2020-05-03,15 193 | 2020-05-10,17 194 | 2020-05-17,17 195 | 2020-05-24,13 196 | 2020-05-31,15 197 | 2020-06-07,12 198 | 2020-06-14,12 199 | 2020-06-21,11 200 | 2020-06-28,12 201 | 2020-07-05,12 202 | 2020-07-12,10 203 | 2020-07-19,11 204 | 2020-07-26,14 205 | 2020-08-02,19 206 | 2020-08-09,26 207 | 2020-08-16,27 208 | 2020-08-23,27 209 | 2020-08-30,25 210 | 2020-09-06,18 211 | 2020-09-13,17 212 | 2020-09-20,20 213 | 2020-09-27,27 214 | 2020-10-04,34 215 | 2020-10-11,40 216 | 2020-10-18,43 217 | 2020-10-25,47 218 | 2020-11-01,42 219 | 2020-11-08,41 220 | 2020-11-15,32 221 | 2020-11-22,27 222 | 2020-11-29,25 223 | 2020-12-06,23 224 | 2020-12-13,18 225 | 2020-12-20,25 226 | 2020-12-27,27 227 | 2021-01-03,21 228 | 2021-01-10,19 229 | 2021-01-17,17 230 | 2021-01-24,21 231 | 2021-01-31,22 232 | 2021-02-07,22 233 | 2021-02-14,21 234 | 2021-02-21,27 235 | 2021-02-28,26 236 | 2021-03-07,29 237 | 2021-03-14,24 238 | 2021-03-21,26 239 | 2021-03-28,28 240 | 2021-04-04,23 241 | 2021-04-11,22 242 | 2021-04-18,18 243 | 2021-04-25,23 244 | 2021-05-02,25 245 | 2021-05-09,22 246 | 2021-05-16,23 247 | 2021-05-23,21 248 | 2021-05-30,29 249 | 2021-06-06,25 250 | 2021-06-13,23 251 | 2021-06-20,20 252 | 2021-06-27,21 253 | 2021-07-04,22 254 | 2021-07-11,22 255 | 2021-07-18,26 256 | 2021-07-25,25 257 | 2021-08-01,24 258 | 2021-08-08,30 259 | 2021-08-15,30 260 | 2021-08-22,28 261 | 2021-08-29,29 262 | 2021-09-05,25 263 | -------------------------------------------------------------------------------- /DataNarrative/source/fever.csv: -------------------------------------------------------------------------------- 1 | Week,fever 2 | 2016-09-11,18 3 | 
2016-09-18,21 4 | 2016-09-25,22 5 | 2016-10-02,20 6 | 2016-10-09,22 7 | 2016-10-16,21 8 | 2016-10-23,23 9 | 2016-10-30,19 10 | 2016-11-06,19 11 | 2016-11-13,20 12 | 2016-11-20,22 13 | 2016-11-27,23 14 | 2016-12-04,24 15 | 2016-12-11,31 16 | 2016-12-18,44 17 | 2016-12-25,59 18 | 2017-01-01,47 19 | 2017-01-08,37 20 | 2017-01-15,32 21 | 2017-01-22,32 22 | 2017-01-29,30 23 | 2017-02-05,29 24 | 2017-02-12,29 25 | 2017-02-19,29 26 | 2017-02-26,24 27 | 2017-03-05,24 28 | 2017-03-12,26 29 | 2017-03-19,26 30 | 2017-03-26,22 31 | 2017-04-02,20 32 | 2017-04-09,23 33 | 2017-04-16,25 34 | 2017-04-23,20 35 | 2017-04-30,20 36 | 2017-05-07,22 37 | 2017-05-14,19 38 | 2017-05-21,24 39 | 2017-05-28,26 40 | 2017-06-04,19 41 | 2017-06-11,20 42 | 2017-06-18,22 43 | 2017-06-25,21 44 | 2017-07-02,23 45 | 2017-07-09,22 46 | 2017-07-16,24 47 | 2017-07-23,21 48 | 2017-07-30,21 49 | 2017-08-06,22 50 | 2017-08-13,27 51 | 2017-08-20,22 52 | 2017-08-27,19 53 | 2017-09-03,22 54 | 2017-09-10,19 55 | 2017-09-17,22 56 | 2017-09-24,23 57 | 2017-10-01,23 58 | 2017-10-08,21 59 | 2017-10-15,20 60 | 2017-10-22,20 61 | 2017-10-29,19 62 | 2017-11-05,19 63 | 2017-11-12,22 64 | 2017-11-19,20 65 | 2017-11-26,21 66 | 2017-12-03,24 67 | 2017-12-10,28 68 | 2017-12-17,34 69 | 2017-12-24,59 70 | 2017-12-31,60 71 | 2018-01-07,46 72 | 2018-01-14,40 73 | 2018-01-21,43 74 | 2018-01-28,38 75 | 2018-02-04,34 76 | 2018-02-11,28 77 | 2018-02-18,29 78 | 2018-02-25,27 79 | 2018-03-04,23 80 | 2018-03-11,26 81 | 2018-03-18,24 82 | 2018-03-25,24 83 | 2018-04-01,25 84 | 2018-04-08,21 85 | 2018-04-15,21 86 | 2018-04-22,24 87 | 2018-04-29,21 88 | 2018-05-06,19 89 | 2018-05-13,21 90 | 2018-05-20,20 91 | 2018-05-27,25 92 | 2018-06-03,21 93 | 2018-06-10,23 94 | 2018-06-17,23 95 | 2018-06-24,22 96 | 2018-07-01,20 97 | 2018-07-08,20 98 | 2018-07-15,20 99 | 2018-07-22,23 100 | 2018-07-29,23 101 | 2018-08-05,24 102 | 2018-08-12,28 103 | 2018-08-19,27 104 | 2018-08-26,28 105 | 2018-09-02,25 106 | 2018-09-09,24 107 | 2018-09-16,21 108 | 
2018-09-23,22 109 | 2018-09-30,22 110 | 2018-10-07,24 111 | 2018-10-14,22 112 | 2018-10-21,21 113 | 2018-10-28,16 114 | 2018-11-04,22 115 | 2018-11-11,22 116 | 2018-11-18,21 117 | 2018-11-25,19 118 | 2018-12-02,21 119 | 2018-12-09,23 120 | 2018-12-16,24 121 | 2018-12-23,30 122 | 2018-12-30,31 123 | 2019-01-06,32 124 | 2019-01-13,33 125 | 2019-01-20,44 126 | 2019-01-27,48 127 | 2019-02-03,45 128 | 2019-02-10,38 129 | 2019-02-17,36 130 | 2019-02-24,34 131 | 2019-03-03,29 132 | 2019-03-10,25 133 | 2019-03-17,25 134 | 2019-03-24,24 135 | 2019-03-31,23 136 | 2019-04-07,19 137 | 2019-04-14,21 138 | 2019-04-21,24 139 | 2019-04-28,23 140 | 2019-05-05,26 141 | 2019-05-12,20 142 | 2019-05-19,23 143 | 2019-05-26,19 144 | 2019-06-02,21 145 | 2019-06-09,23 146 | 2019-06-16,22 147 | 2019-06-23,25 148 | 2019-06-30,23 149 | 2019-07-07,24 150 | 2019-07-14,21 151 | 2019-07-21,21 152 | 2019-07-28,20 153 | 2019-08-04,22 154 | 2019-08-11,22 155 | 2019-08-18,20 156 | 2019-08-25,21 157 | 2019-09-01,20 158 | 2019-09-08,17 159 | 2019-09-15,20 160 | 2019-09-22,21 161 | 2019-09-29,22 162 | 2019-10-06,24 163 | 2019-10-13,25 164 | 2019-10-20,26 165 | 2019-10-27,23 166 | 2019-11-03,21 167 | 2019-11-10,20 168 | 2019-11-17,21 169 | 2019-11-24,23 170 | 2019-12-01,21 171 | 2019-12-08,20 172 | 2019-12-15,24 173 | 2019-12-22,26 174 | 2019-12-29,30 175 | 2020-01-05,31 176 | 2020-01-12,34 177 | 2020-01-19,41 178 | 2020-01-26,44 179 | 2020-02-02,40 180 | 2020-02-09,44 181 | 2020-02-16,44 182 | 2020-02-23,63 183 | 2020-03-01,58 184 | 2020-03-08,100 185 | 2020-03-15,80 186 | 2020-03-22,59 187 | 2020-03-29,44 188 | 2020-04-05,33 189 | 2020-04-12,30 190 | 2020-04-19,31 191 | 2020-04-26,33 192 | 2020-05-03,29 193 | 2020-05-10,29 194 | 2020-05-17,26 195 | 2020-05-24,25 196 | 2020-05-31,25 197 | 2020-06-07,23 198 | 2020-06-14,21 199 | 2020-06-21,21 200 | 2020-06-28,22 201 | 2020-07-05,22 202 | 2020-07-12,23 203 | 2020-07-19,21 204 | 2020-07-26,24 205 | 2020-08-02,25 206 | 2020-08-09,36 207 | 2020-08-16,43 208 
| 2020-08-23,37 209 | 2020-08-30,34 210 | 2020-09-06,33 211 | 2020-09-13,34 212 | 2020-09-20,30 213 | 2020-09-27,33 214 | 2020-10-04,40 215 | 2020-10-11,47 216 | 2020-10-18,48 217 | 2020-10-25,50 218 | 2020-11-01,56 219 | 2020-11-08,45 220 | 2020-11-15,37 221 | 2020-11-22,30 222 | 2020-11-29,23 223 | 2020-12-06,27 224 | 2020-12-13,26 225 | 2020-12-20,29 226 | 2020-12-27,31 227 | 2021-01-03,26 228 | 2021-01-10,22 229 | 2021-01-17,23 230 | 2021-01-24,25 231 | 2021-01-31,29 232 | 2021-02-07,24 233 | 2021-02-14,30 234 | 2021-02-21,35 235 | 2021-02-28,37 236 | 2021-03-07,32 237 | 2021-03-14,30 238 | 2021-03-21,34 239 | 2021-03-28,28 240 | 2021-04-04,28 241 | 2021-04-11,25 242 | 2021-04-18,29 243 | 2021-04-25,27 244 | 2021-05-02,30 245 | 2021-05-09,27 246 | 2021-05-16,28 247 | 2021-05-23,25 248 | 2021-05-30,30 249 | 2021-06-06,28 250 | 2021-06-13,30 251 | 2021-06-20,33 252 | 2021-06-27,26 253 | 2021-07-04,27 254 | 2021-07-11,33 255 | 2021-07-18,39 256 | 2021-07-25,38 257 | 2021-08-01,38 258 | 2021-08-08,36 259 | 2021-08-15,34 260 | 2021-08-22,31 261 | 2021-08-29,26 262 | 2021-09-05,29 263 | -------------------------------------------------------------------------------- /DataNarrative/source/paracetamol.csv: -------------------------------------------------------------------------------- 1 | Date,20 cps,30 cps,tot 2 | 2016-06-01,2407999,1875856,4283855 3 | 2016-12-01,2799691,1857160,4656851 4 | 2017-06-01,2455595,1850597,4306192 5 | 2017-12-01,2747445,2145758,4893203 6 | 2018-06-01,2358099,1981767,4339866 7 | 2018-12-01,2463923,2238109,4702032 8 | 2019-06-01,2212298,2112443,4324741 9 | 2019-12-01,2337481,2399045,4736526 10 | 2020-06-01,2889999,2470717,5360716 11 | 2020-12-01,2385066,2371008,4756074 12 | -------------------------------------------------------------------------------- /DataNarrative/source/sore throat.csv: -------------------------------------------------------------------------------- 1 | Week,sore throat 2 | 2016-09-11,18 3 | 2016-09-18,26 4 | 
2016-09-25,41 5 | 2016-10-02,41 6 | 2016-10-09,34 7 | 2016-10-16,33 8 | 2016-10-23,35 9 | 2016-10-30,29 10 | 2016-11-06,34 11 | 2016-11-13,37 12 | 2016-11-20,34 13 | 2016-11-27,34 14 | 2016-12-04,37 15 | 2016-12-11,39 16 | 2016-12-18,42 17 | 2016-12-25,57 18 | 2017-01-01,53 19 | 2017-01-08,40 20 | 2017-01-15,29 21 | 2017-01-22,34 22 | 2017-01-29,29 23 | 2017-02-05,33 24 | 2017-02-12,41 25 | 2017-02-19,35 26 | 2017-02-26,40 27 | 2017-03-05,34 28 | 2017-03-12,35 29 | 2017-03-19,37 30 | 2017-03-26,34 31 | 2017-04-02,33 32 | 2017-04-09,34 33 | 2017-04-16,29 34 | 2017-04-23,31 35 | 2017-04-30,28 36 | 2017-05-07,37 37 | 2017-05-14,28 38 | 2017-05-21,25 39 | 2017-05-28,30 40 | 2017-06-04,23 41 | 2017-06-11,18 42 | 2017-06-18,24 43 | 2017-06-25,19 44 | 2017-07-02,23 45 | 2017-07-09,16 46 | 2017-07-16,14 47 | 2017-07-23,15 48 | 2017-07-30,20 49 | 2017-08-06,21 50 | 2017-08-13,32 51 | 2017-08-20,27 52 | 2017-08-27,21 53 | 2017-09-03,22 54 | 2017-09-10,23 55 | 2017-09-17,32 56 | 2017-09-24,38 57 | 2017-10-01,43 58 | 2017-10-08,44 59 | 2017-10-15,32 60 | 2017-10-22,31 61 | 2017-10-29,31 62 | 2017-11-05,35 63 | 2017-11-12,30 64 | 2017-11-19,29 65 | 2017-11-26,41 66 | 2017-12-03,33 67 | 2017-12-10,37 68 | 2017-12-17,48 69 | 2017-12-24,60 70 | 2017-12-31,60 71 | 2018-01-07,50 72 | 2018-01-14,33 73 | 2018-01-21,41 74 | 2018-01-28,43 75 | 2018-02-04,28 76 | 2018-02-11,33 77 | 2018-02-18,42 78 | 2018-02-25,36 79 | 2018-03-04,28 80 | 2018-03-11,31 81 | 2018-03-18,37 82 | 2018-03-25,34 83 | 2018-04-01,34 84 | 2018-04-08,30 85 | 2018-04-15,29 86 | 2018-04-22,34 87 | 2018-04-29,30 88 | 2018-05-06,26 89 | 2018-05-13,29 90 | 2018-05-20,30 91 | 2018-05-27,26 92 | 2018-06-03,23 93 | 2018-06-10,21 94 | 2018-06-17,26 95 | 2018-06-24,22 96 | 2018-07-01,21 97 | 2018-07-08,21 98 | 2018-07-15,17 99 | 2018-07-22,21 100 | 2018-07-29,28 101 | 2018-08-05,28 102 | 2018-08-12,27 103 | 2018-08-19,28 104 | 2018-08-26,26 105 | 2018-09-02,23 106 | 2018-09-09,27 107 | 2018-09-16,31 108 | 2018-09-23,33 109 | 
2018-09-30,36 110 | 2018-10-07,40 111 | 2018-10-14,36 112 | 2018-10-21,37 113 | 2018-10-28,24 114 | 2018-11-04,28 115 | 2018-11-11,35 116 | 2018-11-18,33 117 | 2018-11-25,34 118 | 2018-12-02,35 119 | 2018-12-09,34 120 | 2018-12-16,46 121 | 2018-12-23,50 122 | 2018-12-30,44 123 | 2019-01-06,33 124 | 2019-01-13,39 125 | 2019-01-20,41 126 | 2019-01-27,38 127 | 2019-02-03,37 128 | 2019-02-10,41 129 | 2019-02-17,38 130 | 2019-02-24,29 131 | 2019-03-03,37 132 | 2019-03-10,32 133 | 2019-03-17,32 134 | 2019-03-24,39 135 | 2019-03-31,37 136 | 2019-04-07,34 137 | 2019-04-14,32 138 | 2019-04-21,28 139 | 2019-04-28,31 140 | 2019-05-05,25 141 | 2019-05-12,29 142 | 2019-05-19,23 143 | 2019-05-26,25 144 | 2019-06-02,30 145 | 2019-06-09,23 146 | 2019-06-16,25 147 | 2019-06-23,23 148 | 2019-06-30,17 149 | 2019-07-07,20 150 | 2019-07-14,20 151 | 2019-07-21,18 152 | 2019-07-28,18 153 | 2019-08-04,17 154 | 2019-08-11,17 155 | 2019-08-18,21 156 | 2019-08-25,23 157 | 2019-09-01,25 158 | 2019-09-08,18 159 | 2019-09-15,27 160 | 2019-09-22,38 161 | 2019-09-29,35 162 | 2019-10-06,39 163 | 2019-10-13,29 164 | 2019-10-20,29 165 | 2019-10-27,30 166 | 2019-11-03,37 167 | 2019-11-10,31 168 | 2019-11-17,32 169 | 2019-11-24,36 170 | 2019-12-01,35 171 | 2019-12-08,42 172 | 2019-12-15,35 173 | 2019-12-22,58 174 | 2019-12-29,48 175 | 2020-01-05,44 176 | 2020-01-12,39 177 | 2020-01-19,44 178 | 2020-01-26,44 179 | 2020-02-02,38 180 | 2020-02-09,40 181 | 2020-02-16,48 182 | 2020-02-23,61 183 | 2020-03-01,72 184 | 2020-03-08,100 185 | 2020-03-15,63 186 | 2020-03-22,44 187 | 2020-03-29,37 188 | 2020-04-05,30 189 | 2020-04-12,29 190 | 2020-04-19,22 191 | 2020-04-26,21 192 | 2020-05-03,22 193 | 2020-05-10,20 194 | 2020-05-17,20 195 | 2020-05-24,22 196 | 2020-05-31,16 197 | 2020-06-07,14 198 | 2020-06-14,17 199 | 2020-06-21,15 200 | 2020-06-28,11 201 | 2020-07-05,20 202 | 2020-07-12,13 203 | 2020-07-19,24 204 | 2020-07-26,32 205 | 2020-08-02,42 206 | 2020-08-09,48 207 | 2020-08-16,54 208 | 2020-08-23,46 209 
#!/usr/bin/env python
# coding: utf-8

"""Animate a tourist-arrivals line chart in Streamlit.

Loads a monthly time series and, when the *Start* button is pressed,
redraws an Altair line chart with ``burst`` additional months per frame
until the whole series is visible.
"""

import time

import altair as alt
import pandas as pd
import streamlit as st

df = pd.read_csv('../sources/tourist_arrivals.csv')
df['date'] = pd.to_datetime(df['date'])

# Build an initial placeholder chart.  The '1:T' / '0:Q' shorthand refers to
# positional columns, so this renders an empty frame with axes only.
lines = alt.Chart(df).mark_line().encode(
    x=alt.X('1:T', axis=alt.Axis(title='date')),
    y=alt.Y('0:Q', axis=alt.Axis(title='value'))
).properties(
    width=600,
    height=300
)


def plot_animation(frame_df):
    """Return a 600x300 Altair line chart of date (x) vs. value (y).

    Parameters
    ----------
    frame_df : pandas.DataFrame
        Prefix of the full series to draw for the current frame; must
        contain ``date`` and ``value`` columns.
    """
    return alt.Chart(frame_df).mark_line().encode(
        x=alt.X('date:T', axis=alt.Axis(title='date')),
        y=alt.Y('value:Q', axis=alt.Axis(title='value')),
    ).properties(
        width=600,
        height=300
    )


N = df.shape[0]  # number of rows (months) in the dataframe
burst = 6        # number of elements (months) to add to the plot per frame
size = burst     # number of rows currently shown

# Plot Animation
line_plot = st.altair_chart(lines)
start_btn = st.button('Start')

if start_btn:
    # BUG FIX: the original loop did `size = i + burst` with i = 1, 2, ...,
    # which grew the window by only ONE row per frame after the first
    # iteration (contradicting the intent of adding `burst` months per
    # frame) and kept looping after the series was fully drawn.  It also
    # clamped size to N - 1, so the final row was never plotted.
    while size < N:
        step_df = df.iloc[0:size]
        lines = plot_animation(step_df)
        line_plot = line_plot.altair_chart(lines)
        size = min(size + burst, N)  # grow by `burst`, never past the end
        time.sleep(0.1)
    # Draw the final, complete frame (includes the last row).
    line_plot.altair_chart(plot_animation(df))
15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/barChart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/choroplethMap.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 21 | 22 | 23 | 24 | 25 |
26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/js/chart.js: -------------------------------------------------------------------------------- 1 | // set the dimensions and margins of the graph 2 | var margin = {top: 20, right: 30, bottom: 40, left: 90}, 3 | width = 460 - margin.left - margin.right, 4 | height = 400 - margin.top - margin.bottom; 5 | 6 | // append the svg object to the body of the page 7 | var svg = d3.select("#my_chart") 8 | .append("svg") 9 | .attr("width", width + margin.left + margin.right) 10 | .attr("height", height + margin.top + margin.bottom) 11 | .append("g") 12 | .attr("transform", 13 | "translate(" + margin.left + "," + margin.top + ")"); 14 | 15 | // Parse the Data 16 | d3.csv("https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/7_OneCatOneNum_header.csv").then( function(data) { 17 | 18 | // Add X axis 19 | var x = d3.scaleLinear() 20 | .domain([0, 13000]) 21 | .range([ 0, width]); 22 | 23 | svg.append("g") 24 | .attr("transform", "translate(0," + height + ")") 25 | .call(d3.axisBottom(x)) 26 | .selectAll("text") 27 | .attr("transform", "translate(-10,0)rotate(-45)") 28 | .style("text-anchor", "end"); 29 | 30 | // Y axis 31 | var y = d3.scaleBand() 32 | .range([ 0, height ]) 33 | .domain(data.map(function(d) { return d.Country; })) 34 | .padding(.1); 35 | 36 | svg.append("g") 37 | .call(d3.axisLeft(y)) 38 | 39 | //Bars 40 | svg.selectAll("myRect") 41 | .data(data) 42 | .enter() 43 | .append("rect") 44 | .attr("x", x(0) ) 45 | .attr("y", function(d) { return y(d.Country); }) 46 | .attr("width", function(d) { return x(d.Value); }) 47 | .attr("height", y.bandwidth() ) 48 | .attr("fill", "#69b3a2") 49 | 50 | 51 | 52 | }) -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/js/chart2.js: -------------------------------------------------------------------------------- 1 | // set the 
dimensions and margins of the graph 2 | var margin = {top: 20, right: 30, bottom: 40, left: 90}, 3 | width = 460 - margin.left - margin.right, 4 | height = 400 - margin.top - margin.bottom; 5 | 6 | // append the svg object to the body of the page 7 | var svg = d3.select("#my_chart") 8 | .append("svg") 9 | .attr("width", width + margin.left + margin.right) 10 | .attr("height", height + margin.top + margin.bottom) 11 | .append("g") 12 | .attr("transform", 13 | "translate(" + margin.left + "," + margin.top + ")"); 14 | 15 | // Parse the Data 16 | d3.csv("https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/7_OneCatOneNum_header.csv", 17 | function(d){ return {Country: d.Country, Value: +d.Value}} 18 | ).then( function(data) { 19 | 20 | data.sort(function(x, y){return d3.descending(x.Value, y.Value);}) 21 | 22 | // Add X axis 23 | var x = d3.scaleLinear() 24 | .domain([0, 13000]) 25 | .range([ 0, width]); 26 | svg.append("g") 27 | .attr("transform", "translate(0," + height + ")") 28 | .call(d3.axisBottom(x)) 29 | .selectAll("text") 30 | .attr("transform", "translate(-10,0)rotate(-45)") 31 | .style("text-anchor", "end"); 32 | 33 | // Y axis 34 | var y = d3.scaleBand() 35 | .range([ 0, height ]) 36 | .domain(data.map(function(d) { return d.Country; })) 37 | .padding(.1); 38 | svg.append("g") 39 | .call(d3.axisLeft(y)) 40 | 41 | //Bars 42 | svg.selectAll("myRect") 43 | .data(data) 44 | .enter() 45 | .append("rect") 46 | .attr("x", x(50) ) 47 | .attr("y", function(d) { return y(d.Country); }) 48 | .attr("width", function(d) { return x(d.Value); }) 49 | .attr("height", y.bandwidth() ) 50 | .attr("fill", function(d){ if (d.Country == 'France') return "#cc0000" ; else return "#a3a3c2"}) 51 | 52 | 53 | // Features of the annotation 54 | const annotations = [ 55 | { 56 | note: { 57 | label: "Thanks to its marketing policy, in 2021 France has reached the third position.", 58 | title: "France product sales", 59 | wrap: 200, // try something smaller to see 
text split in several lines 60 | padding: 10 // More = text lower 61 | 62 | }, 63 | color: ["#cc0000"], 64 | x: x(2500), 65 | y: 100, 66 | dy: 100, 67 | dx: 100 68 | } 69 | ] 70 | 71 | // Add annotation to the chart 72 | const makeAnnotations = d3.annotation() 73 | .annotations(annotations) 74 | 75 | svg.append("g") 76 | .call(makeAnnotations) 77 | 78 | 79 | }) -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/js/line.js: -------------------------------------------------------------------------------- 1 | // set the dimensions and margins of the graph 2 | var margin = {top: 10, right: 30, bottom: 30, left: 60}, 3 | width = 460 - margin.left - margin.right, 4 | height = 400 - margin.top - margin.bottom; 5 | 6 | // create a tooltip 7 | var Tooltip = d3.select("#my_line") 8 | .append("div") 9 | .style("opacity", 0) 10 | .attr("class", "tooltip") 11 | .style("background-color", "white") 12 | .style("border", "solid") 13 | .style("border-width", "1px") 14 | .style("border-radius", "5px") 15 | .style("padding", "5px") 16 | .style("position", "absolute") 17 | 18 | // append the svg object to the body of the page 19 | var svg = d3.select("#my_line") 20 | .append("svg") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", 25 | "translate(" + margin.left + "," + margin.top + ")"); 26 | 27 | 28 | 29 | 30 | //Read the data 31 | d3.csv("https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/connectedscatter.csv", 32 | 33 | // When reading the csv, I must format variables: 34 | function(d){ 35 | return { date : d3.timeParse("%Y-%m-%d")(d.date), value : d.value } 36 | }).then( 37 | 38 | // Now I can use this dataset: 39 | function(data) { 40 | 41 | // Add X axis --> it is a date format 42 | var x = d3.scaleTime() 43 | .domain(d3.extent(data, function(d) { return d.date; })) 44 | .range([ 0, width ]); 45 | 
svg.append("g") 46 | .attr("transform", "translate(0," + height + ")") 47 | .call(d3.axisBottom(x)); 48 | 49 | // Add Y axis 50 | var y = d3.scaleLinear() 51 | .domain( [8000, 9200]) 52 | .range([ height, 0 ]); 53 | svg.append("g") 54 | .call(d3.axisLeft(y)); 55 | 56 | // Add the line 57 | svg.append("path") 58 | .datum(data) 59 | .attr("fill", "none") 60 | .attr("stroke", "black") 61 | .attr("stroke-width", 1.5) 62 | .attr("d", d3.line() 63 | .x(function(d) { return x(d.date) }) 64 | .y(function(d) { return y(d.value) }) 65 | ) 66 | 67 | 68 | 69 | // Three function that change the tooltip when user hover / move / leave a cell 70 | 71 | var mouseover = function(d) { 72 | Tooltip 73 | .style("opacity", 1) 74 | .html("Exact value: " + d.value) 75 | .style("left", (d3.event.pageX) + "px") 76 | .style("top", (d3.event.pageY - 28) + "px"); 77 | } 78 | var mousemove = function(d) { 79 | Tooltip 80 | .html("Exact value: " + d.value) 81 | .style("left", (d3.event.pageX) + "px") 82 | .style("top", (d3.event.pageY - 28) + "px"); 83 | } 84 | var mouseleave = function(d) { 85 | Tooltip 86 | .style("opacity", 0) 87 | } 88 | 89 | // Add the points 90 | svg 91 | .append("g") 92 | .selectAll("dot") 93 | .data(data) 94 | .enter() 95 | .append("circle") 96 | .attr("class", "myCircle") 97 | .attr("cx", function(d) { return x(d.date) } ) 98 | .attr("cy", function(d) { return y(d.value) } ) 99 | .attr("r", 4) 100 | .attr("stroke", "#69b3a2") 101 | .attr("stroke-width", 3) 102 | .attr("fill", "white") 103 | .on("mouseover", mouseover) 104 | .on("mousemove", mousemove) 105 | .on("mouseleave", mouseleave) 106 | }) -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/js/line2.js: -------------------------------------------------------------------------------- 1 | // set the dimensions and margins of the graph 2 | var margin = {top: 10, right: 30, bottom: 40, left: 60}, 3 | width = 460 - margin.left - margin.right, 4 | height = 400 - 
margin.top - margin.bottom; 5 | 6 | // create a tooltip 7 | var Tooltip = d3.select("#my_line") 8 | .append("div") 9 | .style("opacity", 0) 10 | .attr("class", "tooltip") 11 | .style("background-color", "white") 12 | .style("border", "solid") 13 | .style("border-width", "1px") 14 | .style("border-radius", "5px") 15 | .style("padding", "5px") 16 | .style("position", "absolute") 17 | 18 | // append the svg object to the body of the page 19 | var svg = d3.select("#my_line") 20 | .append("svg") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", 25 | "translate(" + margin.left + "," + margin.top + ")"); 26 | 27 | 28 | 29 | 30 | //Read the data 31 | d3.csv("https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/connectedscatter.csv", 32 | 33 | // When reading the csv, I must format variables: 34 | function(d){ 35 | return { date : d3.timeParse("%Y-%m-%d")(d.date), value : +d.value } 36 | }).then( 37 | 38 | // Now I can use this dataset: 39 | function(data) { 40 | 41 | // Add X axis --> it is a date format 42 | var x = d3.scaleTime() 43 | .domain(d3.extent(data, function(d) { return d.date; })) 44 | .range([ 0, width ]); 45 | svg.append("g") 46 | .attr("transform", "translate(0," + height + ")") 47 | .call(d3.axisBottom(x)) 48 | .selectAll("text") 49 | .style("text-anchor", "end") 50 | .attr("dx", "-.8em") 51 | .attr("dy", ".15em") 52 | .attr("transform", "rotate(-45)");; 53 | 54 | // Add Y axis 55 | var y = d3.scaleLinear() 56 | .domain( [8000, 9100]) 57 | .range([ height, 0 ]); 58 | svg.append("g") 59 | .call(d3.axisLeft(y)); 60 | 61 | // text label for the y axis 62 | svg.append("text") 63 | .attr("transform", "rotate(-90)") 64 | .attr("y", 0 - margin.left) 65 | .attr("x",0 - (height / 2)) 66 | .attr("dy", "1em") 67 | .style("text-anchor", "middle") 68 | .text("Earnings"); 69 | 70 | 71 | // Add the line 72 | svg.append("path") 73 | .datum(data) 74 | 
.attr("fill", "none") 75 | .attr("stroke", "black") 76 | .attr("stroke-width", 1.5) 77 | .attr("d", d3.line() 78 | .x(function(d) { return x(d.date) }) 79 | .y(function(d) { return y(d.value) }) 80 | ) 81 | 82 | 83 | 84 | // Three function that change the tooltip when user hover / move / leave a cell 85 | 86 | var mouseover = function(d) { 87 | Tooltip 88 | .style("opacity", 1) 89 | .html(d.value + " €") 90 | .style("left", (d3.event.pageX) + "px") 91 | .style("top", (d3.event.pageY - 28) + "px"); 92 | } 93 | var mousemove = function(d) { 94 | Tooltip 95 | .html(d.value + " €") 96 | .style("left", (d3.event.pageX) + "px") 97 | .style("top", (d3.event.pageY - 28) + "px"); 98 | } 99 | var mouseleave = function(d) { 100 | Tooltip 101 | .style("opacity", 0) 102 | } 103 | 104 | // Add the points 105 | svg 106 | .append("g") 107 | .selectAll("dot") 108 | .data(data) 109 | .enter() 110 | .append("circle") 111 | .attr("class", "myCircle") 112 | .attr("cx", function(d) { return x(d.date) } ) 113 | .attr("cy", function(d) { return y(d.value) } ) 114 | .attr("r", 4) 115 | .attr("stroke", "#000000") 116 | .attr("stroke-width", 1) 117 | .attr("fill", "#000000") 118 | .on("mouseover", mouseover) 119 | .on("mousemove", mousemove) 120 | .on("mouseleave", mouseleave) 121 | 122 | //Add annotations 123 | 124 | var parseDate = function(d){ return d3.timeParse("%Y-%m-%d")(d)} 125 | const annotations = [ 126 | // first annotation 127 | { 128 | note: { 129 | label: "Earnings plummeted", 130 | title: "April 17th - 19th", 131 | wrap: 150, // try something smaller to see text split in several lines 132 | padding: 10 // More = text lower 133 | 134 | }, 135 | color: ["#cc0000"], 136 | x: x(parseDate('2018-04-18')), 137 | y: y(8197), 138 | dy: -100, 139 | dx: -5, 140 | subject: { 141 | radius: 50, 142 | radiusPadding: 5 143 | }, 144 | type: d3.annotationCalloutCircle, 145 | }, 146 | // second annotation 147 | { 148 | note: { 149 | label: "Strong Recovery", 150 | title: "April 20th", 151 | 
wrap: 150, // try something smaller to see text split in several lines 152 | padding: 10 // More = text lower 153 | 154 | }, 155 | color: [" #00b300"], 156 | x: x(parseDate('2018-04-20')), 157 | y: y(8880.23), 158 | dy: 40, 159 | dx: 40, 160 | type: d3.annotationCalloutElbow, 161 | }, 162 | 163 | ] 164 | 165 | window.makeAnnotations = d3.annotation() 166 | .annotations(annotations) 167 | 168 | 169 | 170 | 171 | 172 | 173 | svg.append("g") 174 | .call(makeAnnotations) 175 | }) -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/js/map.js: -------------------------------------------------------------------------------- 1 | // set the dimensions and margins of the graph 2 | var margin = {top: 20, right: 10, bottom: 40, left: 100}, 3 | width = 600 - margin.left - margin.right, 4 | height = 400 - margin.top - margin.bottom; 5 | 6 | // The svg 7 | var svg = d3.select("svg") 8 | .attr("width", width + margin.left + margin.right) 9 | .attr("height", height + margin.top + margin.bottom) 10 | .append("g") 11 | .attr("transform", 12 | "translate(" + margin.left + "," + margin.top + ")"); 13 | 14 | 15 | 16 | // create a tooltip 17 | var tooltip = d3.select("#tooltip") 18 | .style("opacity", 0) 19 | .attr("class", "tooltip") 20 | .style("background-color", "white") 21 | .style("border", "solid") 22 | .style("border-width", "1px") 23 | .style("border-radius", "5px") 24 | .style("padding", "5px") 25 | .style("position", "absolute") 26 | 27 | 28 | 29 | // Map and projection 30 | //var path = d3.geoPath(); 31 | var projection = d3.geoMercator() 32 | .scale(70) 33 | .center([0,20]) 34 | .translate([width / 2 - margin.left, height / 2]); 35 | 36 | // Data and color scale 37 | var data = d3.map(); 38 | 39 | //var my_domain = [100000, 1000000, 10000000, 30000000, 100000000, 500000000] 40 | var domain = [100000000, 500000000] 41 | var labels = ["< 100 M", "100 M - 500 M", "> 500 M"] 42 | var range = ["#F8CAEE","#BF76AF","#852170"] 
43 | var colorScale = d3.scaleThreshold() 44 | .domain(domain) 45 | .range(range); 46 | 47 | 48 | var promises = [] 49 | promises.push(d3.json("https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/world.geojson")) 50 | promises.push(d3.csv("https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/world_population.csv", function(d) { data.set(d.code, +d.pop); })) 51 | 52 | 53 | myDataPromises = Promise.all(promises).then(function(topo) { 54 | 55 | 56 | 57 | 58 | let mouseOver = function(d) { 59 | d3.selectAll(".topo") 60 | 61 | .transition() 62 | .duration(200) 63 | .style("opacity", .5) 64 | 65 | /*var population = data.get(d.id) || 0; 66 | var min_pop = 0 67 | var max_pop = domain[0] 68 | for(var i = 0; i < domain.length; i++){ 69 | if (population >= min_pop && population <= max_pop) 70 | break; 71 | min_pop = domain[i] 72 | if (i < domain.length-1) 73 | max_pop = domain[i+1] 74 | else 75 | max_pop = 1e50 76 | }*/ 77 | 78 | d3.select(this) 79 | //.filter(function(d){d.total = data.get(d.id) || 0; return d.total <= max_pop && d.total >= min_pop}) 80 | .transition() 81 | .duration(200) 82 | .style("opacity", 1) 83 | .style("stroke", "black") 84 | 85 | d.total = data.get(d.id) || 0; 86 | 87 | tooltip 88 | .style("opacity", 0.8) 89 | .html(d.id + ": " + d3.format(",.2r")(d.total)) 90 | .style("left", (d3.event.pageX) + "px") 91 | .style("top", (d3.event.pageY - 28) + "px"); 92 | 93 | d3.select("#annotation") 94 | .style("opacity", 0) 95 | 96 | 97 | } 98 | 99 | let mouseLeave = function(d) { 100 | d3.selectAll(".topo") 101 | .transition() 102 | .duration(200) 103 | .style("opacity", .7) 104 | 105 | d3.selectAll(".topo") 106 | .transition() 107 | .duration(200) 108 | .style("stroke", "transparent") 109 | 110 | d3.select("#annotation") 111 | .style("opacity", 1) 112 | 113 | tooltip 114 | .style("opacity", 0) 115 | } 116 | 117 | var topo = topo[0] 118 | 119 | // Draw the map 120 | svg.append("g") 121 | .selectAll("path") 122 | 123 | 
.data(topo.features) 124 | .enter() 125 | .append("path") 126 | .attr("class", "topo") 127 | // draw each country 128 | .attr("d", d3.geoPath() 129 | .projection(projection) 130 | ) 131 | // set the color of each country 132 | .attr("fill", function (d) { 133 | d.total = data.get(d.id) || 0; 134 | return colorScale(d.total); 135 | }) 136 | .style("opacity", .7) 137 | .on("mouseover", mouseOver ) 138 | .on("mouseleave", mouseLeave ) 139 | 140 | 141 | // legend 142 | var legend_x = width - margin.left 143 | var legend_y = height - 30 144 | svg.append("g") 145 | .attr("class", "legendQuant") 146 | .attr("transform", "translate(" + legend_x + "," + legend_y+")"); 147 | 148 | var legend = d3.legendColor() 149 | .labels(labels) 150 | .title("Population") 151 | .scale(colorScale) 152 | 153 | 154 | svg.select(".legendQuant") 155 | .call(legend); 156 | 157 | 158 | // Features of the annotation 159 | const annotations = [ 160 | { 161 | note: { 162 | label: "despite its great territorial extension Australia has only 20 million inhabitants.", 163 | title: "Australia Population", 164 | wrap: 150, // try something smaller to see text split in several lines 165 | padding: 10 // More = text lower 166 | 167 | }, 168 | color: ["#852170"], 169 | x: projection([150.916672,-31.083332])[0], 170 | y: projection([150.916672,-31.083332])[1], 171 | dy: -30, 172 | dx: 10 173 | } 174 | ] 175 | 176 | 177 | 178 | // Add annotation to the chart 179 | const makeAnnotations = d3.annotation() 180 | .annotations(annotations) 181 | 182 | svg.append("g") 183 | .style("opacity", 1) 184 | .attr("id", "annotation") 185 | .call(makeAnnotations) 186 | 187 | }) 188 | 189 | -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/line.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 16 | 17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /DataVisualization/D3Graphs/simpleLine.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 16 | 17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /DataVisualization/Django/Model Creation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Read from Google Spreadsheet" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from __future__ import print_function\n", 17 | "import os.path\n", 18 | "from googleapiclient.discovery import build\n", 19 | "from google_auth_oauthlib.flow import InstalledAppFlow\n", 20 | "from google.auth.transport.requests import Request\n", 21 | "from google.oauth2.credentials import Credentials\n", 22 | "\n", 23 | "# If modifying these scopes, delete the file token.json.\n", 24 | "SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']\n", 25 | "\n", 26 | "# The ID and range of a sample spreadsheet.\n", 27 | "SPREADSHEET_ID = '1Reb_q0xspIy_QpOid6z9AzAuy4WWRydYda_lk4qaZpY'\n", 28 | "RANGE_NAME = 'Class Data!A1:E'\n", 29 | "\n", 30 | "creds = None\n", 31 | "# The file token.json stores the user's access and refresh tokens, and is\n", 32 | "# created automatically when the authorization flow completes for the first\n", 33 | "# time.\n", 34 | "if os.path.exists('token.json'):\n", 35 | " creds = Credentials.from_authorized_user_file('token.json', SCOPES)\n", 36 | "# If there are no (valid) credentials available, let the user log in.\n", 37 | "if not creds or not creds.valid:\n", 38 | " if creds and creds.expired and creds.refresh_token:\n", 39 | " creds.refresh(Request())\n", 40 | " else:\n", 41 | " flow = InstalledAppFlow.from_client_secrets_file(\n", 42 | " 'credentials.json', SCOPES)\n", 43 | " creds = flow.run_local_server(port=0)\n", 44 | " # Save the credentials for the next run\n", 45 | " with open('token.json', 'w') as token:\n", 46 | " 
token.write(creds.to_json())\n", 47 | "\n", 48 | "service = build('sheets', 'v4', credentials=creds)\n", 49 | "# Call the Sheets API\n", 50 | "sheet = service.spreadsheets()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# Convert the spreadsheet into a dict of dataframes, one for each sheet" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import pandas as pd\n", 67 | "sheet_metadata = service.spreadsheets().get(spreadsheetId=SPREADSHEET_ID).execute()\n", 68 | "df_dict = {}\n", 69 | "\n", 70 | "properties = sheet_metadata.get('sheets')\n", 71 | "for item in properties:\n", 72 | " table = item.get('properties').get('title')\n", 73 | " df_dict[table] = pd.DataFrame()\n", 74 | " \n", 75 | " result = sheet.values().get(spreadsheetId=SPREADSHEET_ID,\n", 76 | " range=table + '!A1:F').execute()\n", 77 | " header = result.get('values', [])[0]\n", 78 | " \n", 79 | " values = result.get('values', [])[1:] \n", 80 | " if not values:\n", 81 | " print('No data found.')\n", 82 | " else:\n", 83 | " all_data = []\n", 84 | " for col_id, col_name in enumerate(header):\n", 85 | " column_data = []\n", 86 | " for row in values:\n", 87 | " if col_id < len(row):\n", 88 | " column_data.append(row[col_id])\n", 89 | " else:\n", 90 | " column_data.append('')\n", 91 | " \n", 92 | "\n", 93 | " ds = pd.Series(data=column_data, name=col_name)\n", 94 | " all_data.append(ds)\n", 95 | " df_dict[table] = pd.concat(all_data, axis=1)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# Build a Mapping Table" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 7, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "{'PERSON': {'id': ['Integer', {'default': 'None', 'primary_key': 'True'}],\n", 114 | " 'Name': ['Char', {'default': 'None', 'max_length': 
'32'}],\n", 115 | " 'Surname': ['Char', {'default': 'None', 'max_length': '32'}],\n", 116 | " 'BirthDate': ['Date', {'default': 'None'}],\n", 117 | " 'Sex': ['Char', {'default': 'None', 'max_length': '1'}]},\n", 118 | " 'ORGANIZATION': {'id': ['Integer',\n", 119 | " {'default': 'None', 'primary_key': 'True'}],\n", 120 | " 'Name': ['Char', {'default': 'None', 'max_length': '64'}],\n", 121 | " 'Address': ['Char', {'default': 'None', 'max_length': '128'}],\n", 122 | " 'WebSite': ['Char', {'default': 'None', 'max_length': '128'}],\n", 123 | " 'Locality': ['Char', {'default': 'None', 'max_length': '64'}]},\n", 124 | " 'STAFF': {'person_id': ['Integer', {'default': 'None'}],\n", 125 | " 'organization_id': ['Integer', {'default': 'None'}]}}" 126 | ] 127 | }, 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "tables = {}\n", 135 | "for table,df in df_dict.items():\n", 136 | " tables[table] = {}\n", 137 | " for i in range(0, df.shape[0]):\n", 138 | " attr = {}\n", 139 | " attr['default'] = 'None'\n", 140 | " if df['MAX LENGTH'][i] != '':\n", 141 | " attr['max_length'] = df['MAX LENGTH'][i]\n", 142 | " if df['KEY'][i] == 'primary key':\n", 143 | " attr['primary_key'] = 'True'\n", 144 | " tables[table][df['ATTRIBUTES'][i]] = [df['DATA TYPE'][i], attr]\n", 145 | "tables" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "# Create the content of the model.py script" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_type(attr_type):\n", 162 | " if isinstance(attr_type, list):\n", 163 | " attr = attr_type[0] + 'Field('\n", 164 | " for k,v in attr_type[1].items():\n", 165 | " attr = attr + k + '=' + v + ','\n", 166 | " attr = attr[:-1]\n", 167 | " return attr + (')\\n')\n", 168 | " else:\n", 169 | " return attr_type + 'Field()\\n'" 170 | ] 171 | }, 
172 | { 173 | "cell_type": "code", 174 | "execution_count": 9, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "script = 'from django.db import models\\n'\n", 179 | "\n", 180 | "for model,attributes in tables.items():\n", 181 | " script = script + \"class \" + model + \"(models.Model):\\n\"\n", 182 | " for attr_name,attr_type in attributes.items():\n", 183 | " script = script + '\\t' + attr_name + ' = models.' + get_type(attr_type)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 10, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "root = 'mywebsite/myapp/'\n", 193 | "file_name = root + 'models.py'" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "'from django.db import models\\nclass PERSON(models.Model):\\n\\tid = models.IntegerField(default=None,primary_key=True)\\n\\tName = models.CharField(default=None,max_length=32)\\n\\tSurname = models.CharField(default=None,max_length=32)\\n\\tBirthDate = models.DateField(default=None)\\n\\tSex = models.CharField(default=None,max_length=1)\\nclass ORGANIZATION(models.Model):\\n\\tid = models.IntegerField(default=None,primary_key=True)\\n\\tName = models.CharField(default=None,max_length=64)\\n\\tAddress = models.CharField(default=None,max_length=128)\\n\\tWebSite = models.CharField(default=None,max_length=128)\\n\\tLocality = models.CharField(default=None,max_length=64)\\nclass STAFF(models.Model):\\n\\tperson_id = models.IntegerField(default=None)\\n\\torganization_id = models.IntegerField(default=None)\\n'" 205 | ] 206 | }, 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "script" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 12, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "with open(file_name, 
\"w\") as py_file:\n", 223 | " py_file.write(script)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "# Create the file myapp/admin.py" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 16, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "'from django.contrib import admin\\n\\n\\nfrom .models import *\\nadmin.site.register(PERSON)\\nadmin.site.register(ORGANIZATION)\\nadmin.site.register(STAFF)\\n'" 242 | ] 243 | }, 244 | "execution_count": 16, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "script = \"\"\"from django.contrib import admin\\n\n", 251 | "\n", 252 | "from .models import *\n", 253 | "\"\"\"\n", 254 | "\n", 255 | "for model in tables.keys():\n", 256 | " script = script + \"admin.site.register(\" + model + \")\\n\"\n", 257 | "script" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 17, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "file_name = root + 'admin.py'" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 18, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "with open(file_name, \"w\",encoding='utf-8') as py_file:\n", 276 | " py_file.write(script)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [] 299 | } 300 | ], 301 | "metadata": { 302 | "kernelspec": { 303 | "display_name": "Python 3 (ipykernel)", 304 | "language": "python", 305 | "name": "python3" 306 | }, 307 | "language_info": { 308 | "codemirror_mode": { 309 | 
"name": "ipython", 310 | "version": 3 311 | }, 312 | "file_extension": ".py", 313 | "mimetype": "text/x-python", 314 | "name": "python", 315 | "nbconvert_exporter": "python", 316 | "pygments_lexer": "ipython3", 317 | "version": "3.8.10" 318 | } 319 | }, 320 | "nbformat": 4, 321 | "nbformat_minor": 4 322 | } 323 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mywebsite.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 
17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/__init__.py -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/__pycache__/admin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/__pycache__/admin.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/__pycache__/apps.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/__pycache__/apps.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/__pycache__/models.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/__pycache__/models.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | from .models import Person 4 | 5 | admin.site.register(Person) -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class MyappConfig(AppConfig): 5 | default_auto_field = 'django.db.models.BigAutoField' 6 | name = 'myapp' 7 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.2.7 on 2021-10-04 16:21 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | initial = True 9 | 10 | dependencies = [ 11 | ] 12 | 13 | operations = [ 14 | migrations.CreateModel( 15 | name='Person', 16 | fields=[ 17 | ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 18 | ('Name', models.CharField(max_length=64)), 19 | ('Surname', models.CharField(max_length=64)), 20 | ('BirthDate', models.DateTimeField()), 21 | ('Sex', models.CharField(max_length=1)), 22 | ], 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/migrations/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/migrations/__init__.py -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/migrations/__pycache__/0001_initial.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/migrations/__pycache__/0001_initial.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/migrations/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/myapp/migrations/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | class Person(models.Model): 4 | Name = models.CharField(max_length=64) 5 | Surname = models.CharField(max_length=64) 6 | BirthDate = models.DateTimeField() 7 | Sex = models.CharField(max_length=1) -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/myapp/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/mywebsite/__init__.py -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/mywebsite/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/__pycache__/settings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/mywebsite/__pycache__/settings.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/__pycache__/urls.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/mywebsite/__pycache__/urls.cpython-38.pyc 
-------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/__pycache__/wsgi.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Django/mywebsite/mywebsite/__pycache__/wsgi.cpython-38.pyc -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for mywebsite project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mywebsite.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for mywebsite project. 3 | 4 | Generated by 'django-admin startproject' using Django 3.2.7. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/3.2/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | import pymysql 15 | pymysql.install_as_MySQLdb() 16 | 17 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 
18 | BASE_DIR = Path(__file__).resolve().parent.parent 19 | 20 | 21 | # Quick-start development settings - unsuitable for production 22 | # See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/ 23 | 24 | # SECURITY WARNING: keep the secret key used in production secret! 25 | SECRET_KEY = 'django-insecure-_)66jn6nueurb8nmr@f3gm@c(3$f_#p67#)7#@n2%=we^v^kzo' 26 | 27 | # SECURITY WARNING: don't run with debug turned on in production! 28 | DEBUG = True 29 | 30 | ALLOWED_HOSTS = [] 31 | 32 | 33 | # Application definition 34 | 35 | INSTALLED_APPS = [ 36 | 'django.contrib.admin', 37 | 'django.contrib.auth', 38 | 'django.contrib.contenttypes', 39 | 'django.contrib.sessions', 40 | 'django.contrib.messages', 41 | 'django.contrib.staticfiles', 42 | 'myapp.apps.MyappConfig' 43 | ] 44 | 45 | MIDDLEWARE = [ 46 | 'django.middleware.security.SecurityMiddleware', 47 | 'django.contrib.sessions.middleware.SessionMiddleware', 48 | 'django.middleware.common.CommonMiddleware', 49 | 'django.middleware.csrf.CsrfViewMiddleware', 50 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 51 | 'django.contrib.messages.middleware.MessageMiddleware', 52 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 53 | ] 54 | 55 | ROOT_URLCONF = 'mywebsite.urls' 56 | 57 | TEMPLATES = [ 58 | { 59 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 60 | 'DIRS': [], 61 | 'APP_DIRS': True, 62 | 'OPTIONS': { 63 | 'context_processors': [ 64 | 'django.template.context_processors.debug', 65 | 'django.template.context_processors.request', 66 | 'django.contrib.auth.context_processors.auth', 67 | 'django.contrib.messages.context_processors.messages', 68 | ], 69 | }, 70 | }, 71 | ] 72 | 73 | WSGI_APPLICATION = 'mywebsite.wsgi.application' 74 | 75 | 76 | # Database 77 | # https://docs.djangoproject.com/en/3.2/ref/settings/#databases 78 | 79 | 80 | DATABASES = { 81 | 'default': { 82 | 'ENGINE': 'django.db.backends.mysql', 83 | 'NAME': 'mywebsite_db', 84 | 'USER': 'root', 85 | 
'PASSWORD' : '' 86 | } 87 | } 88 | 89 | 90 | 91 | # Password validation 92 | # https://docs.djangoproject.com/en/3.2/ref/settings/#auth-password-validators 93 | 94 | AUTH_PASSWORD_VALIDATORS = [ 95 | { 96 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 97 | }, 98 | { 99 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 100 | }, 101 | { 102 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 103 | }, 104 | { 105 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 106 | }, 107 | ] 108 | 109 | 110 | # Internationalization 111 | # https://docs.djangoproject.com/en/3.2/topics/i18n/ 112 | 113 | LANGUAGE_CODE = 'en-us' 114 | 115 | TIME_ZONE = 'UTC' 116 | 117 | USE_I18N = True 118 | 119 | USE_L10N = True 120 | 121 | USE_TZ = True 122 | 123 | 124 | # Static files (CSS, JavaScript, Images) 125 | # https://docs.djangoproject.com/en/3.2/howto/static-files/ 126 | 127 | STATIC_URL = '/static/' 128 | 129 | # Default primary key field type 130 | # https://docs.djangoproject.com/en/3.2/ref/settings/#default-auto-field 131 | 132 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' 133 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/urls.py: -------------------------------------------------------------------------------- 1 | """mywebsite URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/3.2/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. 
Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | from django.urls import path 18 | 19 | urlpatterns = [ 20 | path('admin/', admin.site.urls), 21 | ] 22 | -------------------------------------------------------------------------------- /DataVisualization/Django/mywebsite/mywebsite/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for mywebsite project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mywebsite.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /DataVisualization/GenerativeAI/build_chart.py: -------------------------------------------------------------------------------- 1 | # import the required libraries 2 | # load the data 'tsc00001.csv' into a pandas dataframe 3 | # drop the 'unit' column 4 | # select the rows where 'geo' is 'IT' 5 | # drop the 'geo' column 6 | # use melt() as follows: 7 | # - use 'sectperf' as id_vars 8 | # - use 'date' as var_name 9 | # - use 'value' as value_name 10 | # convert the 'date' column to integer 11 | # convert the 'value' column to float 12 | 13 | # draw a line chart in Python Altair as follows: 14 | # - use 'data' for the x axis 15 | # - use the value column for the y axis 16 | # - use 'sectperf' for color 17 | # save the chart as 'chart.html' 18 | 19 | import pandas as pd 20 | import altair as alt 21 | 22 | df = pd.read_csv('tsc00001.csv') 23 | df = df.drop(columns=['unit']) 24 | df = df[df['geo'] == 'IT'] 25 | df = 
df.drop(columns=['geo']) 26 | df = pd.melt(df, id_vars=['sectperf'], var_name='date', value_name='value') 27 | df['date'] = df['date'].astype(int) 28 | df['value'] = df['value'].astype(float) 29 | 30 | chart = alt.Chart(df).mark_line().encode( 31 | x=alt.X('date:O', title='', axis=alt.Axis(labelAngle=0)), 32 | y=alt.Y('value', title='Percentage of gross domestic product'), 33 | color=alt.Color('sectperf', legend=None), 34 | # set the StrokeWith as follows: 35 | # - if 'sectperf' is 'BES', use 5 36 | # - in the other cases use 1 37 | strokeWidth=alt.condition( 38 | alt.datum.sectperf == 'BES', 39 | alt.value(5), 40 | alt.value(1) 41 | ) 42 | ).properties( 43 | width=600, 44 | height=400, 45 | title=['Driving Growth:', 46 | 'A Decade of Business Enterprise Performance Dominance (2010-2021)'] 47 | ) 48 | 49 | # add a text layer as follows: 50 | # - select only date == 2021 51 | # - use 'data' for the x axis 52 | # - use the sectperf column for the y axis 53 | 54 | text = alt.Chart(df[df['date'] == 2021]).mark_text( 55 | align='left', 56 | baseline='middle', 57 | dx=7, 58 | fontSize=14 59 | ).encode( 60 | x=alt.X('date:O', title='', axis=alt.Axis(labelAngle=0)), 61 | y=alt.Y('value', title='Percentage of gross domestic product'), 62 | text='sectperf', 63 | color=alt.Color('sectperf', legend=None), 64 | ).properties( 65 | width=600, 66 | height=400, 67 | ) 68 | 69 | # Add the following image to the chart: 'red.png' 70 | 71 | df_red = pd.DataFrame({'url': ['red.png']}) 72 | 73 | red = alt.Chart(df_red).mark_image( 74 | align='center', 75 | baseline='top', 76 | width=300, 77 | height=300 78 | ).encode( 79 | url='url' 80 | ) 81 | 82 | 83 | 84 | chart = (red | chart + text 85 | ).configure_axis( 86 | grid=False, 87 | titleFontSize=14, 88 | ).configure_view( 89 | strokeWidth=0 90 | ).configure_title( 91 | fontSize=20, 92 | anchor='middle', 93 | color='grey' 94 | ) 95 | 96 | chart.save('chart.html') 
-------------------------------------------------------------------------------- /DataVisualization/GenerativeAI/chart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 16 | 17 | 18 | 19 | 20 |
21 | 40 | 41 | -------------------------------------------------------------------------------- /DataVisualization/GenerativeAI/red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/GenerativeAI/red.png -------------------------------------------------------------------------------- /DataVisualization/Gradio/Gradio Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gradio Tutorial\n", 8 | "This short tutorial describes how to build a quick interactive interface in Gradio. You can install Gradio through the command `!pip install gradio`." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Basic Interface\n", 16 | "This first example builds an interface which reads a text from a textbox, then it anonymises dates contained in the text and finally displays the anonymised text.\n", 17 | "\n", 18 | "Firstly, I define a function, called `anonymise_text()` which takes as input a text, and then replaces all the dates contained in the text with a HTML text, which highlights the anonymised dates." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 105, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import re\n", 28 | "\n", 29 | "def anonymise_date(txt):\n", 30 | " date_RE=\"(\\d{1,2}[\\.-/]\\d{1,2}[\\.-/]\\d{4})|(\\d{4})\"\n", 31 | " matches=re.findall(date_RE,txt)\n", 32 | " if matches:\n", 33 | " for match in matches:\n", 34 | " index = 0\n", 35 | " if match[index] == '':\n", 36 | " index = 1\n", 37 | " txt = txt.replace(match[index], \"XXXXXX\")\n", 38 | " return txt" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Then, I import the Gradio package and I build the `Interface()` object, which receives as input the following parameters: \n", 46 | "* the `anomyise_date` function previously defined\n", 47 | "* an input textbox `gr.inputs.Textbox()`, which will capture the input text. The textbox receives as input a placeholder, which contains the default text\n", 48 | "* an output HTML object, which will contain the result returned by the `anonymise_date()` function\n", 49 | "* a list of some default texts, identified through the `examples` keyword, which can be used by the user." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 135, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import gradio as gr\n", 59 | "\n", 60 | "iface = gr.Interface(\n", 61 | " anonymise_date,\n", 62 | " gr.inputs.Textbox(placeholder=\"Enter sentence here...\"),\n", 63 | " gr.outputs.HTML(),\n", 64 | " examples=[\n", 65 | " [\"William Shakespeare was born on 26/04/1616\"],\n", 66 | " [\"I will go there on 20/03/2022.\"],\n", 67 | " ]\n", 68 | ")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Finally, I launch the Interface:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 136, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Running locally at: http://127.0.0.1:7903/\n", 88 | "To create a public link, set `share=True` in `launch()`.\n", 89 | "Interface loading below...\n" 90 | ] 91 | }, 92 | { 93 | "data": { 94 | "text/html": [ 95 | "\n", 96 | " \n", 103 | " " 104 | ], 105 | "text/plain": [ 106 | "" 107 | ] 108 | }, 109 | "metadata": {}, 110 | "output_type": "display_data" 111 | }, 112 | { 113 | "data": { 114 | "text/plain": [ 115 | "(, 'http://127.0.0.1:7903/', None)" 116 | ] 117 | }, 118 | "execution_count": 136, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "iface.launch()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# Advanced Interface\n", 132 | "Now I can improve the previous example, by replacing the textbox with a file uploaded dynamically through the interface. In this example, I manage only plain text files (e.g. txt). Gradio uploads a file as a [temporary file](https://docs.python.org/3/library/tempfile.html). 
\n", 133 | "\n", 134 | "I define a function, called `process_text()` which receives as input a temporary file, checks its extension, reads the file content and finally returns the original text as well as the anonymised one. Note that the extension for a txt temporary file is not .txt but .plain." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 133, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "def process_text(tmp_file):\n", 144 | " if not tmp_file.name.endswith('.plain'):\n", 145 | " return 'Not Supported File. Please provide a txt file', ''\n", 146 | " with open(tmp_file.name, 'r') as f:\n", 147 | " txt = f.read()\n", 148 | "\n", 149 | " return txt, anonymise_date(txt)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Now I can build the interface. With respect to the previous example, I pass a `gr.inputs.File()` object as input and I build a list of output files." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 134, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "Running locally at: http://127.0.0.1:7902/\n", 169 | "To create a public link, set `share=True` in `launch()`.\n", 170 | "Interface loading below...\n" 171 | ] 172 | }, 173 | { 174 | "data": { 175 | "text/html": [ 176 | "\n", 177 | " \n", 184 | " " 185 | ], 186 | "text/plain": [ 187 | "" 188 | ] 189 | }, 190 | "metadata": {}, 191 | "output_type": "display_data" 192 | }, 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "(, 'http://127.0.0.1:7902/', None)" 197 | ] 198 | }, 199 | "execution_count": 134, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "iface = gr.Interface(\n", 206 | " process_text,\n", 207 | " gr.inputs.File(label='Upload TXT file'),\n", 208 | " [gr.outputs.HTML(label='Original Text'),gr.outputs.HTML(label='Anonymised 
Text')],\n", 209 | ")\n", 210 | "\n", 211 | "iface.launch()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.8.1" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 4 243 | } 244 | -------------------------------------------------------------------------------- /DataVisualization/Plotly/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/Plotly/.DS_Store -------------------------------------------------------------------------------- /DataVisualization/Plotly/.ipynb_checkpoints/Plot.ly-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /DataVisualization/Plotly/Plot.ly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'positivi.html'" 12 | ] 13 | }, 14 | "execution_count": 6, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import plotly\n", 21 | "import plotly.graph_objs as go\n", 22 | "import pandas as pd\n", 
23 | "\n", 24 | "df = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv')\n", 25 | "\n", 26 | "# Create a trace\n", 27 | "data = [go.Scatter(\n", 28 | " x = df['data'],\n", 29 | " y = df['totale_positivi'],\n", 30 | ")]\n", 31 | "\n", 32 | "layout = go.Layout(\n", 33 | " xaxis=dict(\n", 34 | " title='Data', \n", 35 | " ),\n", 36 | " yaxis=dict(\n", 37 | " title='Totale positivi', \n", 38 | " )\n", 39 | " )\n", 40 | "fig = go.Figure(data=data, layout=layout)\n", 41 | "plotly.offline.plot(fig,filename='positivi.html',config={'displayModeBar': False})" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 3 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython3", 68 | "version": "3.8.1" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 4 73 | } 74 | -------------------------------------------------------------------------------- /DataVisualization/Plotly/positivi.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | 8 |
9 | 10 |
11 | 12 | -------------------------------------------------------------------------------- /DataVisualization/sources/eu_live_births.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/DataVisualization/sources/eu_live_births.xlsx -------------------------------------------------------------------------------- /DataVisualization/sources/eu_regions.csv: -------------------------------------------------------------------------------- 1 | #;Country;Population (2020);Subregion 2 | 1;Russia;145934462;Eastern Europe 3 | 2;Germany;83783942;Western Europe 4 | 3;United Kingdom;67886011;Northern Europe 5 | 4;France;65273511;Western Europe 6 | 5;Italy;60461826;Southern Europe 7 | 6;Spain;46754778;Southern Europe 8 | 7;Ukraine;43733762;Eastern Europe 9 | 8;Poland;37846611;Eastern Europe 10 | 9;Romania;19237691;Eastern Europe 11 | 10;Netherlands;17134872;Western Europe 12 | 11;Belgium;11589623;Western Europe 13 | 12;Czechia;10708981;Eastern Europe 14 | 13;Greece;10423054;Southern Europe 15 | 14;Portugal;10196709;Southern Europe 16 | 15;Sweden;10099265;Northern Europe 17 | 16;Hungary;9660351;Eastern Europe 18 | 17;Belarus;9449323;Eastern Europe 19 | 18;Austria;9006398;Western Europe 20 | 19;Serbia;8737371;Southern Europe 21 | 20;Switzerland;8654622;Western Europe 22 | 21;Bulgaria;6948445;Eastern Europe 23 | 22;Denmark;5792202;Northern Europe 24 | 23;Finland;5540720;Northern Europe 25 | 24;Slovakia;5459642;Eastern Europe 26 | 25;Norway;5421241;Northern Europe 27 | 26;Ireland;4937786;Northern Europe 28 | 27;Croatia;4105267;Southern Europe 29 | 28;Moldova;4033963;Eastern Europe 30 | 29;Bosnia and Herzegovina;3280819;Southern Europe 31 | 30;Albania;2877797;Southern Europe 32 | 31;Lithuania;2722289;Northern Europe 33 | 32;North Macedonia;2083374;Southern Europe 34 | 33;Slovenia;2078938;Southern Europe 35 | 34;Latvia;1886198;Northern Europe 36 | 
35;Kosovo;1798188;Southern Europe 37 | 36;Estonia;1326535;Northern Europe 38 | 37;Montenegro;628066;Southern Europe 39 | 38;Luxembourg;625978;Western Europe 40 | 39;Malta;441543;Southern Europe 41 | 40;Iceland;341243;Northern Europe 42 | 41;Andorra;77265;Southern Europe 43 | 42;Monaco;39242;Western Europe 44 | 43;Liechtenstein;38128;Western Europe 45 | 44;San Marino;33931;Southern Europe 46 | 45;Holy See;801;Southern Europe -------------------------------------------------------------------------------- /DataVisualization/sources/tourist_arrivals.csv: -------------------------------------------------------------------------------- 1 | "date","value" 2 | "'2012-01-01'",2343290 3 | "'2012-02-01'",10468842 4 | "'2012-03-01'",13908950 5 | "'2012-04-01'",18456089 6 | "'2012-05-01'",20294254 7 | "'2012-06-01'",27101300 8 | "'2012-07-01'",32838284 9 | "'2012-08-01'",34392050 10 | "'2012-09-01'",23910073 11 | "'2012-10-01'",15828202 12 | "'2012-11-01'",10155960 13 | "'2012-12-01'",10804312 14 | "'2013-01-01'",9632532 15 | "'2013-02-01'",10628786 16 | "'2013-03-01'",14540671 17 | "'2013-04-01'",16192551 18 | "'2013-05-01'",21295358 19 | "'2013-06-01'",26751429 20 | "'2013-07-01'",32902913 21 | "'2013-08-01'",36156738 22 | "'2013-09-01'",23738366 23 | "'2013-10-01'",16062127 24 | "'2013-11-01'",10214150 25 | "'2013-12-01'",10832733 26 | "'2014-01-01'",9967129 27 | "'2014-02-01'",10511208 28 | "'2014-03-01'",13648915 29 | "'2014-04-01'",18901473 30 | "'2014-05-01'",21959612 31 | "'2014-06-01'",27492107 32 | "'2014-07-01'",32570827 33 | "'2014-08-01'",37625209 34 | "'2014-09-01'",24048425 35 | "'2014-10-01'",16899142 36 | "'2014-11-01'",10128432 37 | "'2014-12-01'",11664198 38 | "'2015-01-01'",10612222 39 | "'2015-02-01'",11550777 40 | "'2015-03-01'",13831733 41 | "'2015-04-01'",18952191 42 | "'2015-05-01'",25627245 43 | "'2015-06-01'",27925041 44 | "'2015-07-01'",35733205 45 | "'2015-08-01'",39925492 46 | "'2015-09-01'",25894479 47 | "'2015-10-01'",18243614 48 | 
"'2015-11-01'",10581677 49 | "'2015-12-01'",12279139 50 | "'2016-01-01'",11134303 51 | "'2016-02-01'",12561880 52 | "'2016-03-01'",16392340 53 | "'2016-04-01'",18591118 54 | "'2016-05-01'",23982773 55 | "'2016-06-01'",28833492 56 | "'2016-07-01'",39233326 57 | "'2016-08-01'",39122111 58 | "'2016-09-01'",27405383 59 | "'2016-10-01'",19341501 60 | "'2016-11-01'",11013233 61 | "'2016-12-01'",12965045 62 | "'2017-01-01'",11838496 63 | "'2017-02-01'",12777744 64 | "'2017-03-01'",15141686 65 | "'2017-04-01'",23678625 66 | "'2017-05-01'",23314377 67 | "'2017-06-01'",33607949 68 | "'2017-07-01'",40571601 69 | "'2017-08-01'",40151914 70 | "'2017-09-01'",28567249 71 | "'2017-10-01'",19936344 72 | "'2017-11-01'",12022393 73 | "'2017-12-01'",14188122 74 | "'2018-01-01'",11658597 75 | "'2018-02-01'",13589839 76 | "'2018-03-01'",17758577 77 | "'2018-04-01'",22161704 78 | "'2018-05-01'",26998217 79 | "'2018-06-01'",34183766 80 | "'2018-07-01'",39230371 81 | "'2018-08-01'",41063487 82 | "'2018-09-01'",30390840 83 | "'2018-10-01'",20257400 84 | "'2018-11-01'",13149876 85 | "'2018-12-01'",14689527 86 | "'2019-01-01'",12024702 87 | "'2019-02-01'",13688422 88 | "'2019-03-01'",16673419 89 | "'2019-04-01'",23658680 90 | "'2019-05-01'",24832942 91 | "'2019-06-01'",34658825 92 | "'2019-07-01'",39123041 93 | "'2019-08-01'",41588218 94 | "'2019-09-01'",30253817 95 | -------------------------------------------------------------------------------- /Datasets/Animals.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 5.1.1 3 | -- https://www.phpmyadmin.net/ 4 | -- 5 | -- Host: localhost 6 | -- Creato il: Nov 26, 2021 alle 20:15 7 | -- Versione del server: 10.4.21-MariaDB 8 | -- Versione PHP: 7.3.31 9 | 10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; 11 | START TRANSACTION; 12 | SET time_zone = "+00:00"; 13 | 14 | 15 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 16 | /*!40101 SET 
@OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 17 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 18 | /*!40101 SET NAMES utf8mb4 */; 19 | 20 | -- 21 | -- Database: `Animals` 22 | -- 23 | 24 | -- -------------------------------------------------------- 25 | 26 | -- 27 | -- Struttura della tabella `species` 28 | -- 29 | 30 | CREATE TABLE `Animals` ( 31 | `animal_id` int(11) NOT NULL, 32 | `name` varchar(32) NOT NULL, 33 | `description` text DEFAULT NULL, 34 | `parent_id` int(11) DEFAULT NULL 35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 36 | 37 | -- 38 | -- Dump dei dati per la tabella `species` 39 | -- 40 | 41 | INSERT INTO `Animals` (`animal_id`, `name`, `description`, `parent_id`) VALUES 42 | (1, 'Animal', NULL, NULL), 43 | (2, 'Mammal', NULL, 1), 44 | (3, 'Bird', NULL, 1), 45 | (4, 'Fish', NULL, 1), 46 | (5, 'Cat', NULL, 2), 47 | (6, 'Dog', NULL, 2), 48 | (7, 'Lion', NULL, 2), 49 | (8, 'Pheasant', NULL, 3), 50 | (9, 'Parrot', NULL, 3), 51 | (10, 'Eagle', NULL, 3), 52 | (11, 'Shark', NULL, 4), 53 | (12, 'Clownfish', NULL, 4), 54 | (13, 'Swordfish', NULL, 4); 55 | 56 | -- 57 | -- Indici per le tabelle scaricate 58 | -- 59 | 60 | -- 61 | -- Indici per le tabelle `species` 62 | -- 63 | ALTER TABLE `Animals` 64 | ADD PRIMARY KEY (`animal_id`), 65 | ADD KEY `parent_child` (`parent_id`); 66 | 67 | -- 68 | -- AUTO_INCREMENT per le tabelle scaricate 69 | -- 70 | 71 | -- 72 | -- AUTO_INCREMENT per la tabella `species` 73 | -- 74 | ALTER TABLE `Animals` 75 | MODIFY `animal_id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=14; 76 | 77 | -- 78 | -- Limiti per le tabelle scaricate 79 | -- 80 | 81 | -- 82 | -- Limiti per la tabella `species` 83 | -- 84 | ALTER TABLE `Animals` 85 | ADD CONSTRAINT `parent_child` FOREIGN KEY (`parent_id`) REFERENCES `Animals` (`animal_id`); 86 | COMMIT; 87 | 88 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 89 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 90 | /*!40101 SET 
COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 91 | -------------------------------------------------------------------------------- /Datasets/Animals_nested.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 5.1.1 3 | -- https://www.phpmyadmin.net/ 4 | -- 5 | -- Host: localhost 6 | -- Creato il: Nov 27, 2021 alle 10:47 7 | -- Versione del server: 10.4.21-MariaDB 8 | -- Versione PHP: 7.3.33 9 | 10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; 11 | START TRANSACTION; 12 | SET time_zone = "+00:00"; 13 | 14 | 15 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 16 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 17 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 18 | /*!40101 SET NAMES utf8mb4 */; 19 | 20 | -- 21 | -- Database: `Animals` 22 | -- 23 | 24 | -- -------------------------------------------------------- 25 | 26 | -- 27 | -- Struttura della tabella `Animals_nested` 28 | -- 29 | 30 | CREATE TABLE `Animals_nested` ( 31 | `animal_id` int(11) NOT NULL, 32 | `name` varchar(32) NOT NULL, 33 | `left_value` int(11) NOT NULL, 34 | `right_value` int(11) NOT NULL 35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 36 | 37 | -- 38 | -- Dump dei dati per la tabella `Animals_nested` 39 | -- 40 | 41 | INSERT INTO `Animals_nested` (`animal_id`, `name`, `left_value`, `right_value`) VALUES 42 | (1, 'Animal', 1, 26), 43 | (2, 'Mammal', 2, 9), 44 | (3, 'Bird', 10, 17), 45 | (4, 'Fish', 18, 25), 46 | (5, 'Cat', 3, 4), 47 | (6, 'Dog', 5, 6), 48 | (7, 'Lion', 7, 8), 49 | (8, 'Pheasant', 11, 12), 50 | (9, 'Parrot', 13, 14), 51 | (10, 'Eagle', 15, 16), 52 | (11, 'Shark', 19, 20), 53 | (12, 'ClownFish', 21, 22), 54 | (13, 'Swordfish', 23, 24); 55 | 56 | -- 57 | -- Indici per le tabelle scaricate 58 | -- 59 | 60 | -- 61 | -- Indici per le tabelle `Animals_nested` 62 | -- 63 | ALTER TABLE `Animals_nested` 64 | ADD PRIMARY KEY (`animal_id`); 65 | 66 | -- 67 | 
-- AUTO_INCREMENT per le tabelle scaricate 68 | -- 69 | 70 | -- 71 | -- AUTO_INCREMENT per la tabella `Animals_nested` 72 | -- 73 | ALTER TABLE `Animals_nested` 74 | MODIFY `animal_id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=14; 75 | COMMIT; 76 | 77 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 78 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 79 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 80 | -------------------------------------------------------------------------------- /Datasets/Shakespeare.txt: -------------------------------------------------------------------------------- 1 | William Shakespeare 2 | 3 | FIRST PARAGRAPH 4 | William Shakespeare (bapt. 26.04.1564 death 23.04.1616) was an English playwright, poet, and actor, widely regarded as the greatest writer in the English language and the world's greatest dramatist. He is often called England's national poet and the "Bard of Avon" (or simply "the Bard"). His extant works, including collaborations, consist of some 39 plays, 154 sonnets, three long narrative poems, and a few other verses, some of uncertain authorship. His plays have been translated into every major living language and are performed more often than those of any other playwright. They also continue to be studied and reinterpreted. 5 | 6 | SECOND PARAGRAPH 7 | 8 | Shakespeare was born and raised in Stratford-upon-Avon, Warwickshire. At the age of 18, he married Anne Hathaway, with whom he had three children: Susanna and twins Hamnet and Judith. Sometime between 1585 and 1592, he began a successful career in London as an actor, writer, and part-owner of a playing company called the Lord Chamberlain's Men, later known as the King's Men. At age 49 (around 1613), he appears to have retired to Stratford, where he died three years later. 
Few records of Shakespeare's private life survive; this has stimulated considerable speculation about such matters as his physical appearance, his sexuality, his religious beliefs, and whether the works attributed to him were written by others. 9 | 10 | Text extracted from Wikipedia. 11 | -------------------------------------------------------------------------------- /Datasets/capitals1.csv: -------------------------------------------------------------------------------- 1 | "Country","Capital","Value" 2 | "Austria","Vienna",2 3 | "Belgium","Brussels",3 4 | "The Netherlands","Amsterdam",1 5 | "Norway","Oslo", 6 | "Austria","Vienna",2 7 | -------------------------------------------------------------------------------- /Datasets/capitals2.csv: -------------------------------------------------------------------------------- 1 | "Country","Capital","Value" 2 | "Italy","Rome",3 3 | "France","Paris",1 4 | "Germany","Berlin",6 5 | "Spain","Madrid",7 6 | "Portugal","Lisboa",8 7 | "Belgium","Brussels",3 8 | -------------------------------------------------------------------------------- /Datasets/rainfall.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/Datasets/rainfall.xlsx -------------------------------------------------------------------------------- /EnvironmentSetup/Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:slim 2 | WORKDIR /app_home 3 | COPY . . 
4 | RUN pip install --no-cache-dir -r requirements.txt 5 | CMD ["python", "./app.py"] -------------------------------------------------------------------------------- /EnvironmentSetup/Docker/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Data Analysis with Scikit-learn 5 | # In this short tutorial I illustrate a complete data analysis process which exploits the `scikit-learn` Python library. The process includes 6 | # * preprocessing, which includes features selection, normalization and balancing 7 | # * model selection with parameters tuning 8 | # * model evaluation 9 | 10 | # ## Load Dataset 11 | # Firstly, I load the dataset through the Python `pandas` library. I exploit the `heart.csv` dataset, provided by the [Kaggle repository](https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset). 12 | 13 | # In[132]: 14 | import warnings 15 | warnings.filterwarnings("ignore") 16 | 17 | import pandas as pd 18 | import numpy as np 19 | 20 | images_dir = 'images' 21 | df = pd.read_csv('source/heart.csv') 22 | df.head() 23 | 24 | 25 | # In[133]: 26 | 27 | 28 | df.shape 29 | 30 | 31 | # ## Features selection 32 | # Now, I split the columns of the dataset in input (`X`) and output (`Y`). I use all the columns but `output` as input features. 33 | 34 | # In[134]: 35 | 36 | 37 | features = [] 38 | for column in df.columns: 39 | if column != 'output': 40 | features.append(column) 41 | features 42 | 43 | 44 | # In[135]: 45 | 46 | 47 | X = df[features] 48 | Y = df['output'] 49 | 50 | 51 | # In order to select the minimum set of input features, I calculate the Pearson correlation coefficient among features, through `corr()` function, provided by a `pandas dataframe`. 52 | 53 | # In[136]: 54 | 55 | 56 | X.corr() 57 | 58 | 59 | # I note that all the features have a low correlation, thus I can keep all of them as input features. 
# ## Data Normalization
# Data Normalization scales all the features in the same interval. I exploit the
# `MinMaxScaler()` provided by the `scikit-learn` library. I dealt with Data
# Normalization in `scikit-learn` in my [previous article](https://towardsdatascience.com/data-normalization-with-python-scikit-learn-e9c5640fed58),
# while I [this](https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-3-normalisation-5b5392d27673)
# article I described the general process of Data Normalization without `scikit-learn`.

# In[137]:


X.describe()


# For each input feature I calculate the `MinMaxScaler()` and I store the result in
# the same `X` column. `fit_transform()` fits the scaler and applies the
# transformation in a single call (equivalent to `fit()` followed by `transform()`).
# Note that each feature must be reshaped to (-1, 1) — a single-column 2D array —
# to be accepted by the scaler; `ravel()` flattens the scaled column back to 1D
# before it is written into the dataframe.
# NOTE(review): the scaler is fitted on the WHOLE dataset here, before the
# train/test split performed below — this leaks test-set statistics into training.
# For anything beyond this tutorial, fit the scaler on the training set only.

# In[138]:


from sklearn.preprocessing import MinMaxScaler

for column in X.columns:
    # Single-column 2D view of the feature, as required by the scaler API.
    feature = np.array(X[column]).reshape(-1, 1)
    scaler = MinMaxScaler()
    # Fit and transform in one idiomatic call, then flatten (n, 1) -> (n,).
    X[column] = scaler.fit_transform(feature).ravel()


# In[139]:


X.describe()


# ## Split the dataset in Training and Test
# Now I split the dataset into two parts: training and testset. The test set size
# is 20% of the whole dataset. I exploit the `scikit-learn` function
# `train_test_split()`. I will use the training set to train the model and the
# testset to test the performance of the model.
# In[140]:


# NOTE(review): numpy is already imported at the top of the script; this re-import
# is harmless but redundant.
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)


# ## Balancing
# I check whether the dataset is balanced or not, i.e. if the output classes in the
# training set are equally represented. I can use the `value_counts()` function to
# calculate the number of records in each output class.

# In[141]:


y_train.value_counts()


# The output classes are not balanced, thus I can balance it. I can exploit the
# `imblearn` library, to perform balancing. I try both oversampling the minority
# class and undersampling the majority class. More details related to the
# Imbalanced Learn library can be found [here](https://imbalanced-learn.org/stable/).
# Firstly, I perform over sampling through the `RandomOverSampler()`. I create the
# model and then I fit with the training set. The `fit_resample()` function returns
# the balanced training set.

# In[142]:


from imblearn.over_sampling import RandomOverSampler
over_sampler = RandomOverSampler(random_state=42)
# Randomly duplicates minority-class rows of the TRAINING set only; the test set is
# left untouched so evaluation stays honest.
X_bal_over, y_bal_over = over_sampler.fit_resample(X_train, y_train)


# I calculate the number of records in each class through the `value_counts()`
# function and I note that now the dataset is balanced.

# In[143]:


y_bal_over.value_counts()


# Secondly, I perform under sampling through the `RandomUnderSampler()` model.
# In[144]:


from imblearn.under_sampling import RandomUnderSampler

# Mirror experiment: shrink the majority class instead of growing the minority one.
under_sampler = RandomUnderSampler(random_state=42)
X_bal_under, y_bal_under = under_sampler.fit_resample(X_train, y_train)


# In[145]:


y_bal_under.value_counts()


# ## Model Selection and Training

# Now, I'm ready to train the model. I choose a `KNeighborsClassifier` and firstly
# I train it with imbalanced data. I exploit the `fit()` function to train the
# model and then the `predict_proba()` function to predict the values of the test
# set.

# In[120]:


from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
# predict_proba() returns one probability column per class; column 1 (positive
# class) is what roc_curve() consumes below.
y_score = model.predict_proba(X_test)


# I calculate the performance of the model. In particular, I calculate the
# `roc_curve()` and the `precision_recall()` and then I plot them. I exploit the
# `scikitplot` library to plot curves.
#
# From the plot I note that there is a roc curve for each class. With respect to
# the precision recall curve, the class 1 works better than class 0, probably
# because it is represented by a greater number of samples.

# In[121]:


import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
# NOTE(review): importing `auc` from scikitplot.metrics relies on scikitplot
# re-exporting it; prefer `from sklearn.metrics import auc`, where it is actually
# defined — verify before changing.
from scikitplot.metrics import plot_roc,auc
from scikitplot.metrics import plot_precision_recall

fpr0, tpr0, thresholds = roc_curve(y_test, y_score[:, 1])

# Plot metrics
plot_roc(y_test, y_score)
plt.savefig(f"{images_dir}/roc.png")
#plt.show()

plot_precision_recall(y_test, y_score)
plt.savefig(f"{images_dir}/precision_recall.png")
#plt.show()


# Now, I recalculate the same things with oversampling balancing. I note that the
# precision recall curve of class 0 increases, while that of class 1 decreases.

# In[122]:


# Same classifier, now trained on the over-sampled training set.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_bal_over, y_bal_over)
y_score = model.predict_proba(X_test)


# In[123]:


fpr0, tpr0, thresholds = roc_curve(y_test, y_score[:, 1])

# Plot metrics
plot_roc(y_test, y_score)
plt.savefig(f"{images_dir}/roc-oversampling.png")
#plt.show()

plot_precision_recall(y_test, y_score)
plt.savefig(f"{images_dir}/precision_recall-oversampling.png")
#plt.show()


# Finally, I train the model through under sampled data and I note a general
# deterioration of the performance.

# In[124]:


# Third run: trained on the under-sampled training set.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_bal_under, y_bal_under)
y_score = model.predict_proba(X_test)


# In[125]:


fpr0, tpr0, thresholds = roc_curve(y_test, y_score[:, 1])

# Plot metrics
plot_roc(y_test, y_score)
#plt.show()
plt.savefig(f"{images_dir}/roc-undersampling.png")

plot_precision_recall(y_test, y_score)
plt.savefig(f"{images_dir}/precision_recall-undersampling.png")
#plt.show()


# ## Parameters Tuning
# In the last part of this tutorial, I try to improve the performance of the model
# by searching for best parameters for my model. I exploit the `GridSearchCV`
# mechanism provided by the `scikit-learn` library. I select a range of values for
# each parameter to be tested and I put them in the `param_grid` variable. I create
# a `GridSearchCV()` object, I fit with the training set and then I retrieve the
# best estimator, contained in the `best_estimator_` variable.
# In[126]:


from sklearn.model_selection import GridSearchCV

model = KNeighborsClassifier()

# Search space: neighbourhood size, neighbour-search algorithm and distance metric.
param_grid = {
    'n_neighbors': np.arange(2,8),
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric' : ['euclidean','manhattan','chebyshev','minkowski']
}

# Exhaustive cross-validated search over param_grid on the (imbalanced) training set.
grid = GridSearchCV(model, param_grid = param_grid)
grid.fit(X_train, y_train)


best_estimator = grid.best_estimator_
best_estimator


# I exploit the best estimator as model for my predictions and I calculate the
# performance of the algorithm.

# In[127]:


# NOTE(review): this refit looks redundant — GridSearchCV with the default
# refit=True already fits best_estimator_ on the full training set; confirm before
# removing.
best_estimator.fit(X_train, y_train)
y_score = best_estimator.predict_proba(X_test)


# In[128]:


fpr0, tpr0, thresholds = roc_curve(y_test, y_score[:, 1])

# Plot metrics
plot_roc(y_test, y_score)
plt.savefig(f"{images_dir}/roc-cv.png")
#plt.show()

plot_precision_recall(y_test, y_score)
plt.savefig(f"{images_dir}/precision_recall-cv.png")
#plt.show()


# I note that the roc curve has improved. I try now with the over sampled training
# set.

# In[129]:


# Repeat the grid search on the over-sampled training data.
grid = GridSearchCV(model, param_grid = param_grid)
grid.fit(X_bal_over, y_bal_over)


best_estimator = grid.best_estimator_
best_estimator


# In this case I obtain the best performance.
298 | 299 | # In[130]: 300 | 301 | 302 | best_estimator.fit(X_bal_over, y_bal_over) 303 | y_score = best_estimator.predict_proba(X_test) 304 | 305 | 306 | # In[131]: 307 | 308 | 309 | fpr0, tpr0, thresholds = roc_curve(y_test, y_score[:, 1]) 310 | roc_auc0 = auc(fpr0, tpr0) 311 | 312 | # Plot metrics 313 | plot_roc(y_test, y_score) 314 | plt.savefig(f"{images_dir}/roc-cv-oversampling.png") 315 | #plt.show() 316 | 317 | plot_precision_recall(y_test, y_score) 318 | plt.savefig(f"{images_dir}/precision_recall-cv-oversampling.png") 319 | #plt.show() 320 | 321 | 322 | # In[ ]: 323 | 324 | 325 | 326 | 327 | -------------------------------------------------------------------------------- /EnvironmentSetup/Docker/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scikit-learn 4 | imblearn 5 | matplotlib 6 | scikit-plot -------------------------------------------------------------------------------- /Preprocessing/Balancing/glass.csv: -------------------------------------------------------------------------------- 1 | RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,target 2 | 1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,1 3 | 1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1 4 | 1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1 5 | 1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1 6 | 1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1 7 | 1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1 8 | 1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,1 9 | 1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,1 10 | 1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,1 11 | 1.51755,13.00,3.60,1.36,72.99,0.57,8.40,0.00,0.11,1 12 | 1.51571,12.72,3.46,1.56,73.20,0.67,8.09,0.00,0.24,1 13 | 1.51763,12.80,3.66,1.27,73.01,0.60,8.56,0.00,0.00,1 14 | 1.51589,12.88,3.43,1.40,73.28,0.69,8.05,0.00,0.24,1 15 | 1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0.00,0.17,1 16 | 1.51763,12.61,3.59,1.31,73.29,0.58,8.50,0.00,0.00,1 17 | 
1.51761,12.81,3.54,1.23,73.24,0.58,8.39,0.00,0.00,1 18 | 1.51784,12.68,3.67,1.16,73.11,0.61,8.70,0.00,0.00,1 19 | 1.52196,14.36,3.85,0.89,71.36,0.15,9.15,0.00,0.00,1 20 | 1.51911,13.90,3.73,1.18,72.12,0.06,8.89,0.00,0.00,1 21 | 1.51735,13.02,3.54,1.69,72.73,0.54,8.44,0.00,0.07,1 22 | 1.51750,12.82,3.55,1.49,72.75,0.54,8.52,0.00,0.19,1 23 | 1.51966,14.77,3.75,0.29,72.02,0.03,9.00,0.00,0.00,1 24 | 1.51736,12.78,3.62,1.29,72.79,0.59,8.70,0.00,0.00,1 25 | 1.51751,12.81,3.57,1.35,73.02,0.62,8.59,0.00,0.00,1 26 | 1.51720,13.38,3.50,1.15,72.85,0.50,8.43,0.00,0.00,1 27 | 1.51764,12.98,3.54,1.21,73.00,0.65,8.53,0.00,0.00,1 28 | 1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,1 29 | 1.51721,12.87,3.48,1.33,73.04,0.56,8.43,0.00,0.00,1 30 | 1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.00,0.00,1 31 | 1.51784,13.08,3.49,1.28,72.86,0.60,8.49,0.00,0.00,1 32 | 1.51768,12.65,3.56,1.30,73.08,0.61,8.69,0.00,0.14,1 33 | 1.51747,12.84,3.50,1.14,73.27,0.56,8.55,0.00,0.00,1 34 | 1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,1 35 | 1.51753,12.57,3.47,1.38,73.39,0.60,8.55,0.00,0.06,1 36 | 1.51783,12.69,3.54,1.34,72.95,0.57,8.75,0.00,0.00,1 37 | 1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0.00,0.00,1 38 | 1.51909,13.89,3.53,1.32,71.81,0.51,8.78,0.11,0.00,1 39 | 1.51797,12.74,3.48,1.35,72.96,0.64,8.68,0.00,0.00,1 40 | 1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,1 41 | 1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,1 42 | 1.51793,12.79,3.50,1.12,73.03,0.64,8.77,0.00,0.00,1 43 | 1.51755,12.71,3.42,1.20,73.20,0.59,8.64,0.00,0.00,1 44 | 1.51779,13.21,3.39,1.33,72.76,0.59,8.59,0.00,0.00,1 45 | 1.52210,13.73,3.84,0.72,71.76,0.17,9.74,0.00,0.00,1 46 | 1.51786,12.73,3.43,1.19,72.95,0.62,8.76,0.00,0.30,1 47 | 1.51900,13.49,3.48,1.35,71.95,0.55,9.00,0.00,0.00,1 48 | 1.51869,13.19,3.37,1.18,72.72,0.57,8.83,0.00,0.16,1 49 | 1.52667,13.99,3.70,0.71,71.57,0.02,9.82,0.00,0.10,1 50 | 1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0.00,0.00,1 51 | 1.51898,13.58,3.35,1.23,72.08,0.59,8.91,0.00,0.00,1 52 | 
1.52320,13.72,3.72,0.51,71.75,0.09,10.06,0.00,0.16,1 53 | 1.51926,13.20,3.33,1.28,72.36,0.60,9.14,0.00,0.11,1 54 | 1.51808,13.43,2.87,1.19,72.84,0.55,9.03,0.00,0.00,1 55 | 1.51837,13.14,2.84,1.28,72.85,0.55,9.07,0.00,0.00,1 56 | 1.51778,13.21,2.81,1.29,72.98,0.51,9.02,0.00,0.09,1 57 | 1.51769,12.45,2.71,1.29,73.70,0.56,9.06,0.00,0.24,1 58 | 1.51215,12.99,3.47,1.12,72.98,0.62,8.35,0.00,0.31,1 59 | 1.51824,12.87,3.48,1.29,72.95,0.60,8.43,0.00,0.00,1 60 | 1.51754,13.48,3.74,1.17,72.99,0.59,8.03,0.00,0.00,1 61 | 1.51754,13.39,3.66,1.19,72.79,0.57,8.27,0.00,0.11,1 62 | 1.51905,13.60,3.62,1.11,72.64,0.14,8.76,0.00,0.00,1 63 | 1.51977,13.81,3.58,1.32,71.72,0.12,8.67,0.69,0.00,1 64 | 1.52172,13.51,3.86,0.88,71.79,0.23,9.54,0.00,0.11,1 65 | 1.52227,14.17,3.81,0.78,71.35,0.00,9.69,0.00,0.00,1 66 | 1.52172,13.48,3.74,0.90,72.01,0.18,9.61,0.00,0.07,1 67 | 1.52099,13.69,3.59,1.12,71.96,0.09,9.40,0.00,0.00,1 68 | 1.52152,13.05,3.65,0.87,72.22,0.19,9.85,0.00,0.17,1 69 | 1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0.00,0.17,1 70 | 1.52152,13.12,3.58,0.90,72.20,0.23,9.82,0.00,0.16,1 71 | 1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,1 72 | 1.51574,14.86,3.67,1.74,71.87,0.16,7.36,0.00,0.12,2 73 | 1.51848,13.64,3.87,1.27,71.96,0.54,8.32,0.00,0.32,2 74 | 1.51593,13.09,3.59,1.52,73.10,0.67,7.83,0.00,0.00,2 75 | 1.51631,13.34,3.57,1.57,72.87,0.61,7.89,0.00,0.00,2 76 | 1.51596,13.02,3.56,1.54,73.11,0.72,7.90,0.00,0.00,2 77 | 1.51590,13.02,3.58,1.51,73.12,0.69,7.96,0.00,0.00,2 78 | 1.51645,13.44,3.61,1.54,72.39,0.66,8.03,0.00,0.00,2 79 | 1.51627,13.00,3.58,1.54,72.83,0.61,8.04,0.00,0.00,2 80 | 1.51613,13.92,3.52,1.25,72.88,0.37,7.94,0.00,0.14,2 81 | 1.51590,12.82,3.52,1.90,72.86,0.69,7.97,0.00,0.00,2 82 | 1.51592,12.86,3.52,2.12,72.66,0.69,7.97,0.00,0.00,2 83 | 1.51593,13.25,3.45,1.43,73.17,0.61,7.86,0.00,0.00,2 84 | 1.51646,13.41,3.55,1.25,72.81,0.68,8.10,0.00,0.00,2 85 | 1.51594,13.09,3.52,1.55,72.87,0.68,8.05,0.00,0.09,2 86 | 1.51409,14.25,3.09,2.08,72.28,1.10,7.08,0.00,0.00,2 87 | 
1.51625,13.36,3.58,1.49,72.72,0.45,8.21,0.00,0.00,2 88 | 1.51569,13.24,3.49,1.47,73.25,0.38,8.03,0.00,0.00,2 89 | 1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,2 90 | 1.51618,13.01,3.50,1.48,72.89,0.60,8.12,0.00,0.00,2 91 | 1.51640,12.55,3.48,1.87,73.23,0.63,8.08,0.00,0.09,2 92 | 1.51841,12.93,3.74,1.11,72.28,0.64,8.96,0.00,0.22,2 93 | 1.51605,12.90,3.44,1.45,73.06,0.44,8.27,0.00,0.00,2 94 | 1.51588,13.12,3.41,1.58,73.26,0.07,8.39,0.00,0.19,2 95 | 1.51590,13.24,3.34,1.47,73.10,0.39,8.22,0.00,0.00,2 96 | 1.51629,12.71,3.33,1.49,73.28,0.67,8.24,0.00,0.00,2 97 | 1.51860,13.36,3.43,1.43,72.26,0.51,8.60,0.00,0.00,2 98 | 1.51841,13.02,3.62,1.06,72.34,0.64,9.13,0.00,0.15,2 99 | 1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,2 100 | 1.51689,12.67,2.88,1.71,73.21,0.73,8.54,0.00,0.00,2 101 | 1.51811,12.96,2.96,1.43,72.92,0.60,8.79,0.14,0.00,2 102 | 1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,2 103 | 1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,2 104 | 1.51820,12.62,2.76,0.83,73.81,0.35,9.42,0.00,0.20,2 105 | 1.52725,13.80,3.15,0.66,70.57,0.08,11.64,0.00,0.00,2 106 | 1.52410,13.83,2.90,1.17,71.15,0.08,10.79,0.00,0.00,2 107 | 1.52475,11.45,0.00,1.88,72.19,0.81,13.24,0.00,0.34,2 108 | 1.53125,10.73,0.00,2.10,69.81,0.58,13.30,3.15,0.28,2 109 | 1.53393,12.30,0.00,1.00,70.16,0.12,16.19,0.00,0.24,2 110 | 1.52222,14.43,0.00,1.00,72.67,0.10,11.52,0.00,0.08,2 111 | 1.51818,13.72,0.00,0.56,74.45,0.00,10.99,0.00,0.00,2 112 | 1.52664,11.23,0.00,0.77,73.21,0.00,14.68,0.00,0.00,2 113 | 1.52739,11.02,0.00,0.75,73.08,0.00,14.96,0.00,0.00,2 114 | 1.52777,12.64,0.00,0.67,72.02,0.06,14.40,0.00,0.00,2 115 | 1.51892,13.46,3.83,1.26,72.55,0.57,8.21,0.00,0.14,2 116 | 1.51847,13.10,3.97,1.19,72.44,0.60,8.43,0.00,0.00,2 117 | 1.51846,13.41,3.89,1.33,72.38,0.51,8.28,0.00,0.00,2 118 | 1.51829,13.24,3.90,1.41,72.33,0.55,8.31,0.00,0.10,2 119 | 1.51708,13.72,3.68,1.81,72.06,0.64,7.88,0.00,0.00,2 120 | 1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,2 121 | 
1.51652,13.56,3.57,1.47,72.45,0.64,7.96,0.00,0.00,2 122 | 1.51844,13.25,3.76,1.32,72.40,0.58,8.42,0.00,0.00,2 123 | 1.51663,12.93,3.54,1.62,72.96,0.64,8.03,0.00,0.21,2 124 | 1.51687,13.23,3.54,1.48,72.84,0.56,8.10,0.00,0.00,2 125 | 1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,2 126 | 1.52177,13.20,3.68,1.15,72.75,0.54,8.52,0.00,0.00,2 127 | 1.51872,12.93,3.66,1.56,72.51,0.58,8.55,0.00,0.12,2 128 | 1.51667,12.94,3.61,1.26,72.75,0.56,8.60,0.00,0.00,2 129 | 1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.00,0.17,2 130 | 1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,2 131 | 1.52020,13.98,1.35,1.63,71.76,0.39,10.56,0.00,0.18,2 132 | 1.52177,13.75,1.01,1.36,72.19,0.33,11.14,0.00,0.00,2 133 | 1.52614,13.70,0.00,1.36,71.24,0.19,13.44,0.00,0.10,2 134 | 1.51813,13.43,3.98,1.18,72.49,0.58,8.15,0.00,0.00,2 135 | 1.51800,13.71,3.93,1.54,71.81,0.54,8.21,0.00,0.15,2 136 | 1.51811,13.33,3.85,1.25,72.78,0.52,8.12,0.00,0.00,2 137 | 1.51789,13.19,3.90,1.30,72.33,0.55,8.44,0.00,0.28,2 138 | 1.51806,13.00,3.80,1.08,73.07,0.56,8.38,0.00,0.12,2 139 | 1.51711,12.89,3.62,1.57,72.96,0.61,8.11,0.00,0.00,2 140 | 1.51674,12.79,3.52,1.54,73.36,0.66,7.90,0.00,0.00,2 141 | 1.51674,12.87,3.56,1.64,73.14,0.65,7.99,0.00,0.00,2 142 | 1.51690,13.33,3.54,1.61,72.54,0.68,8.11,0.00,0.00,2 143 | 1.51851,13.20,3.63,1.07,72.83,0.57,8.41,0.09,0.17,2 144 | 1.51662,12.85,3.51,1.44,73.01,0.68,8.23,0.06,0.25,2 145 | 1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,2 146 | 1.51660,12.99,3.18,1.23,72.97,0.58,8.81,0.00,0.24,2 147 | 1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.00,0.35,2 148 | 1.51769,13.65,3.66,1.11,72.77,0.11,8.60,0.00,0.00,3 149 | 1.51610,13.33,3.53,1.34,72.67,0.56,8.33,0.00,0.00,3 150 | 1.51670,13.24,3.57,1.38,72.70,0.56,8.44,0.00,0.10,3 151 | 1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.00,0.00,3 152 | 1.51665,13.14,3.45,1.76,72.48,0.60,8.38,0.00,0.17,3 153 | 1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,3 154 | 1.51779,13.64,3.65,0.65,73.00,0.06,8.93,0.00,0.00,3 155 | 
1.51610,13.42,3.40,1.22,72.69,0.59,8.32,0.00,0.00,3 156 | 1.51694,12.86,3.58,1.31,72.61,0.61,8.79,0.00,0.00,3 157 | 1.51646,13.04,3.40,1.26,73.01,0.52,8.58,0.00,0.00,3 158 | 1.51655,13.41,3.39,1.28,72.64,0.52,8.65,0.00,0.00,3 159 | 1.52121,14.03,3.76,0.58,71.79,0.11,9.65,0.00,0.00,3 160 | 1.51776,13.53,3.41,1.52,72.04,0.58,8.79,0.00,0.00,3 161 | 1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,3 162 | 1.51832,13.33,3.34,1.54,72.14,0.56,8.99,0.00,0.00,3 163 | 1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,3 164 | 1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,3 165 | 1.51514,14.01,2.68,3.50,69.89,1.68,5.87,2.20,0.00,5 166 | 1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,5 167 | 1.52171,11.56,1.88,1.56,72.86,0.47,11.41,0.00,0.00,5 168 | 1.52151,11.03,1.71,1.56,73.44,0.58,11.62,0.00,0.00,5 169 | 1.51969,12.64,0.00,1.65,73.75,0.38,11.53,0.00,0.00,5 170 | 1.51666,12.86,0.00,1.83,73.88,0.97,10.17,0.00,0.00,5 171 | 1.51994,13.27,0.00,1.76,73.03,0.47,11.32,0.00,0.00,5 172 | 1.52369,13.44,0.00,1.58,72.22,0.32,12.24,0.00,0.00,5 173 | 1.51316,13.02,0.00,3.04,70.48,6.21,6.96,0.00,0.00,5 174 | 1.51321,13.00,0.00,3.02,70.70,6.21,6.93,0.00,0.00,5 175 | 1.52043,13.38,0.00,1.40,72.25,0.33,12.50,0.00,0.00,5 176 | 1.52058,12.85,1.61,2.17,72.18,0.76,9.70,0.24,0.51,5 177 | 1.52119,12.97,0.33,1.51,73.39,0.13,11.27,0.00,0.28,5 178 | 1.51905,14.00,2.39,1.56,72.37,0.00,9.57,0.00,0.00,6 179 | 1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,6 180 | 1.51829,14.46,2.24,1.62,72.38,0.00,9.26,0.00,0.00,6 181 | 1.51852,14.09,2.19,1.66,72.67,0.00,9.32,0.00,0.00,6 182 | 1.51299,14.40,1.74,1.54,74.55,0.00,7.59,0.00,0.00,6 183 | 1.51888,14.99,0.78,1.74,72.50,0.00,9.95,0.00,0.00,6 184 | 1.51916,14.15,0.00,2.09,72.74,0.00,10.88,0.00,0.00,6 185 | 1.51969,14.56,0.00,0.56,73.48,0.00,11.22,0.00,0.00,6 186 | 1.51115,17.38,0.00,0.34,75.41,0.00,6.65,0.00,0.00,6 187 | 1.51131,13.69,3.20,1.81,72.81,1.76,5.43,1.19,0.00,7 188 | 1.51838,14.32,3.26,2.22,71.25,1.46,5.79,1.63,0.00,7 189 | 
1.52315,13.44,3.34,1.23,72.38,0.60,8.83,0.00,0.00,7 190 | 1.52247,14.86,2.20,2.06,70.26,0.76,9.76,0.00,0.00,7 191 | 1.52365,15.79,1.83,1.31,70.43,0.31,8.61,1.68,0.00,7 192 | 1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,7 193 | 1.51602,14.85,0.00,2.38,73.28,0.00,8.76,0.64,0.09,7 194 | 1.51623,14.20,0.00,2.79,73.46,0.04,9.04,0.40,0.09,7 195 | 1.51719,14.75,0.00,2.00,73.02,0.00,8.53,1.59,0.08,7 196 | 1.51683,14.56,0.00,1.98,73.29,0.00,8.52,1.57,0.07,7 197 | 1.51545,14.14,0.00,2.68,73.39,0.08,9.07,0.61,0.05,7 198 | 1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,7 199 | 1.51727,14.70,0.00,2.34,73.28,0.00,8.95,0.66,0.00,7 200 | 1.51531,14.38,0.00,2.66,73.10,0.04,9.08,0.64,0.00,7 201 | 1.51609,15.01,0.00,2.51,73.05,0.05,8.83,0.53,0.00,7 202 | 1.51508,15.15,0.00,2.25,73.50,0.00,8.34,0.63,0.00,7 203 | 1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,7 204 | 1.51514,14.85,0.00,2.42,73.72,0.00,8.39,0.56,0.00,7 205 | 1.51658,14.80,0.00,1.99,73.11,0.00,8.28,1.71,0.00,7 206 | 1.51617,14.95,0.00,2.27,73.30,0.00,8.71,0.67,0.00,7 207 | 1.51732,14.95,0.00,1.80,72.99,0.00,8.61,1.55,0.00,7 208 | 1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,7 209 | 1.51831,14.39,0.00,1.82,72.86,1.41,6.47,2.88,0.00,7 210 | 1.51640,14.37,0.00,2.74,72.85,0.00,9.45,0.54,0.00,7 211 | 1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7 212 | 1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7 213 | 1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7 214 | 1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7 215 | 1.51711,14.23,0.00,2.08,73.36,0.00,8.62,1.67,0.00,7 -------------------------------------------------------------------------------- /Preprocessing/Binning/.ipynb_checkpoints/Data Preprocessing - Binning-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- 
/Preprocessing/Binning/cupcake.csv: -------------------------------------------------------------------------------- 1 | Mese,Cupcake 2004-01,5 2004-02,5 2004-03,4 2004-04,6 2004-05,5 2004-06,6 2004-07,6 2004-08,5 2004-09,5 2004-10,10 2004-11,7 2004-12,7 2005-01,8 2005-02,8 2005-03,8 2005-04,8 2005-05,8 2005-06,8 2005-07,8 2005-08,8 2005-09,8 2005-10,11 2005-11,8 2005-12,8 2006-01,9 2006-02,11 2006-03,10 2006-04,11 2006-05,11 2006-06,11 2006-07,11 2006-08,12 2006-09,13 2006-10,15 2006-11,11 2006-12,12 2007-01,12 2007-02,16 2007-03,16 2007-04,16 2007-05,15 2007-06,18 2007-07,18 2007-08,19 2007-09,18 2007-10,24 2007-11,19 2007-12,18 2008-01,20 2008-02,25 2008-03,25 2008-04,25 2008-05,25 2008-06,24 2008-07,25 2008-08,26 2008-09,26 2008-10,32 2008-11,29 2008-12,27 2009-01,29 2009-02,36 2009-03,35 2009-04,36 2009-05,36 2009-06,36 2009-07,37 2009-08,40 2009-09,36 2009-10,44 2009-11,38 2009-12,36 2010-01,38 2010-02,47 2010-03,46 2010-04,45 2010-05,50 2010-06,49 2010-07,57 2010-08,65 2010-09,60 2010-10,64 2010-11,58 2010-12,56 2011-01,57 2011-02,73 2011-03,71 2011-04,75 2011-05,70 2011-06,70 2011-07,71 2011-08,79 2011-09,75 2011-10,88 2011-11,80 2011-12,80 2012-01,80 2012-02,99 2012-03,92 2012-04,94 2012-05,94 2012-06,93 2012-07,88 2012-08,89 2012-09,94 2012-10,100 2012-11,87 2012-12,85 2013-01,82 2013-02,98 2013-03,100 2013-04,87 2013-05,91 2013-06,84 2013-07,84 2013-08,85 2013-09,88 2013-10,97 2013-11,87 2013-12,86 2014-01,82 2014-02,100 2014-03,92 2014-04,90 2014-05,86 2014-06,78 2014-07,80 2014-08,84 2014-09,84 2014-10,92 2014-11,84 2014-12,80 2015-01,74 2015-02,87 2015-03,81 2015-04,78 2015-05,77 2015-06,72 2015-07,72 2015-08,70 2015-09,69 2015-10,81 2015-11,73 2015-12,69 2016-01,68 2016-02,76 2016-03,73 2016-04,65 2016-05,68 2016-06,63 2016-07,61 2016-08,60 2016-09,63 2016-10,67 2016-11,61 2016-12,58 2017-01,57 2017-02,71 2017-03,62 2017-04,60 2017-05,58 2017-06,55 2017-07,53 2017-08,54 2017-09,54 2017-10,61 2017-11,54 2017-12,51 2018-01,50 2018-02,60 2018-03,57 
2018-04,50 2018-05,51 2018-06,49 2018-07,47 2018-08,47 2018-09,49 2018-10,53 2018-11,48 2018-12,50 2019-01,51 2019-02,55 2019-03,50 2019-04,48 2019-05,46 2019-06,44 2019-07,43 2019-08,44 2019-09,45 2019-10,57 2019-11,45 2019-12,44 2020-01,43 2020-02,51 2020-03,46 2020-04,58 2020-05,60 2020-06,53 2020-07,50 2020-08,47 2020-09,44 2020-10,49 2020-11,44 2020-12,43 -------------------------------------------------------------------------------- /Preprocessing/MissingValues/cupcake.csv: -------------------------------------------------------------------------------- 1 | "Mese","Cupcake" 2 | "2004-01",5 3 | "2004-02", 4 | "2004-03",4 5 | "2004-04",6 6 | "2004-05",5 7 | "2004-06",6 8 | "2004-07",6 9 | "2004-08",5 10 | "2004-09",5 11 | "2004-10",10 12 | "2004-11",7 13 | "2004-12",7 14 | "2005-01",8 15 | "2005-02",8 16 | "2005-03",8 17 | "2005-04",8 18 | "2005-05",8 19 | "2005-06",8 20 | "2005-07",8 21 | "2005-08",8 22 | "2005-09",8 23 | "2005-10",11 24 | "2005-11",8 25 | "2005-12",8 26 | "2006-01",9 27 | "2006-02",11 28 | "2006-03", 29 | "2006-04",11 30 | "2006-05",11 31 | "2006-06",11 32 | "2006-07",11 33 | "2006-08",12 34 | "2006-09",13 35 | "2006-10",15 36 | "2006-11",11 37 | "2006-12",12 38 | "2007-01",12 39 | "2007-02",16 40 | "2007-03",16 41 | "2007-04",16 42 | "2007-05",15 43 | "2007-06",18 44 | "2007-07",18 45 | "2007-08",19 46 | "2007-09",18 47 | "2007-10",24 48 | "2007-11",19 49 | "2007-12",18 50 | "2008-01",20 51 | "2008-02",25 52 | "2008-03",25 53 | "2008-04",25 54 | "2008-05",25 55 | "2008-06",24 56 | "2008-07",25 57 | "2008-08",26 58 | "2008-09",26 59 | "2008-10",32 60 | "2008-11",29 61 | "2008-12",27 62 | "2009-01",29 63 | "2009-02",36 64 | "2009-03",35 65 | "2009-04",36 66 | "2009-05",36 67 | "2009-06",36 68 | "2009-07",37 69 | "2009-08",40 70 | "2009-09",36 71 | "2009-10",44 72 | "2009-11",38 73 | "2009-12",36 74 | "2010-01",38 75 | "2010-02",47 76 | "2010-03",46 77 | "2010-04",45 78 | "2010-05",50 79 | "2010-06",49 80 | "2010-07",57 81 | "2010-08",65 82 | 
"2010-09",60 83 | "2010-10",64 84 | "2010-11",58 85 | "2010-12",56 86 | "2011-01",57 87 | "2011-02",73 88 | "2011-03",71 89 | "2011-04",75 90 | "2011-05",70 91 | "2011-06",70 92 | "2011-07",71 93 | "2011-08",79 94 | "2011-09",75 95 | "2011-10",88 96 | "2011-11",80 97 | "2011-12",80 98 | "2012-01",80 99 | "2012-02",99 100 | "2012-03",92 101 | "2012-04",94 102 | "2012-05",94 103 | "2012-06",93 104 | "2012-07",88 105 | "2012-08",89 106 | "2012-09",94 107 | "2012-10",100 108 | "2012-11",87 109 | "2012-12",85 110 | "2013-01",82 111 | "2013-02",98 112 | "2013-03",100 113 | "2013-04",87 114 | "2013-05",91 115 | "2013-06",84 116 | "2013-07",84 117 | "2013-08",85 118 | "2013-09",88 119 | "2013-10",97 120 | "2013-11",87 121 | "2013-12",86 122 | "2014-01",82 123 | "2014-02",100 124 | "2014-03",92 125 | "2014-04",90 126 | "2014-05",86 127 | "2014-06",78 128 | "2014-07",80 129 | "2014-08",84 130 | "2014-09",84 131 | "2014-10",92 132 | "2014-11",84 133 | "2014-12",80 134 | "2015-01",74 135 | "2015-02",87 136 | "2015-03",81 137 | "2015-04",78 138 | "2015-05",77 139 | "2015-06",72 140 | "2015-07",72 141 | "2015-08",70 142 | "2015-09",69 143 | "2015-10",81 144 | "2015-11",73 145 | "2015-12",69 146 | "2016-01",68 147 | "2016-02",76 148 | "2016-03",73 149 | "2016-04",65 150 | "2016-05",68 151 | "2016-06",63 152 | "2016-07",61 153 | "2016-08",60 154 | "2016-09",63 155 | "2016-10",67 156 | "2016-11",61 157 | "2016-12",58 158 | "2017-01",57 159 | "2017-02",71 160 | "2017-03",62 161 | "2017-04",60 162 | "2017-05",58 163 | "2017-06",55 164 | "2017-07",53 165 | "2017-08",54 166 | "2017-09",54 167 | "2017-10",61 168 | "2017-11",54 169 | "2017-12",51 170 | "2018-01",50 171 | "2018-02",60 172 | "2018-03",57 173 | "2018-04",50 174 | "2018-05",51 175 | "2018-06",49 176 | "2018-07",47 177 | "2018-08",47 178 | "2018-09",49 179 | "2018-10",53 180 | "2018-11",48 181 | "2018-12",50 182 | "2019-01",51 183 | "2019-02",55 184 | "2019-03",50 185 | "2019-04",48 186 | "2019-05",46 187 | "2019-06",44 188 | 
"2019-07",43 189 | "2019-08",44 190 | "2019-09",45 191 | "2019-10",57 192 | "2019-11",45 193 | "2019-12",44 194 | "2020-01",43 195 | "2020-02",51 196 | "2020-03",46 197 | "2020-04",58 198 | "2020-05",60 199 | "2020-06",53 200 | "2020-07",50 201 | "2020-08",47 202 | "2020-09",44 203 | "2020-10",49 204 | "2020-11",44 205 | "2020-12",43 206 | -------------------------------------------------------------------------------- /Preprocessing/Normalization/.ipynb_checkpoints/Data Preprocessing - Normalization-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /Preprocessing/RemoveDuplicates/cupcake_duplicates.csv: -------------------------------------------------------------------------------- 1 | Mese,Cupcake 2004-01,5 2004-01,5 2004-01,5 2004-02,5 2004-03,4 2004-04,6 2004-05,5 2004-06,6 2004-07,6 2004-08,5 2004-09,5 2004-10,10 2004-11,7 2004-12,7 2005-01,8 2005-02,8 2005-03,8 2005-04,8 2005-05,8 2005-06,8 2005-07,8 2005-08,8 2005-09,8 2005-10,11 2005-11,8 2005-12,8 2006-01,9 2006-02,11 2006-03,10 2006-04,11 2006-05,11 2006-06,11 2006-07,11 2006-08,12 2006-09,13 2006-10,15 2006-11,11 2006-12,12 2007-01,12 2007-02,16 2007-03,16 2007-04,16 2007-05,15 2007-06,18 2007-07,18 2007-08,19 2007-09,18 2007-09,18 2007-09,18 2007-10,24 2007-11,19 2007-12,18 2008-01,20 2008-02,25 2008-03,25 2008-04,25 2008-04,25 2008-04,25 2008-05,25 2008-06,24 2008-07,25 2008-08,26 2008-09,26 2008-10,32 2008-11,29 2008-12,27 2009-01,29 2009-02,36 2009-03,35 2009-04,36 2009-05,36 2009-06,36 2009-07,37 2009-08,40 2009-09,36 2009-10,44 2009-11,38 2009-12,36 2010-01,38 2010-02,47 2010-03,46 2010-04,45 2010-05,50 2010-06,49 2010-07,57 2010-08,65 2010-09,60 2010-10,64 2010-11,58 2010-12,56 2011-01,57 2011-02,73 2011-03,71 2011-04,75 2011-05,70 2011-06,70 2011-07,71 2011-08,79 2011-09,75 
2011-10,88 2011-11,80 2011-12,80 2012-01,80 2012-02,99 2012-03,92 2012-04,94 2012-05,94 2012-06,93 2012-07,88 2012-08,89 2012-09,94 2012-10,100 2012-11,87 2012-12,85 2013-01,82 2013-02,98 2013-03,100 2013-04,87 2013-05,91 2013-06,84 2013-07,84 2013-08,85 2013-09,88 2013-10,97 2013-11,87 2013-12,86 2014-01,82 2014-02,100 2014-03,92 2014-04,90 2014-05,86 2014-06,78 2014-07,80 2014-08,84 2014-09,84 2014-10,92 2014-11,84 2014-12,80 2015-01,74 2015-02,87 2015-03,81 2015-04,78 2015-05,77 2015-06,72 2015-07,72 2015-08,70 2015-09,69 2015-10,81 2015-11,73 2015-12,69 2016-01,68 2016-02,76 2016-03,73 2016-04,65 2016-05,68 2016-06,63 2016-07,61 2016-08,60 2016-09,63 2016-10,67 2016-11,61 2016-12,58 2017-01,57 2017-02,71 2017-03,62 2017-04,60 2017-05,58 2017-06,55 2017-07,53 2017-08,54 2017-09,54 2017-10,61 2017-11,54 2017-12,51 2018-01,50 2018-02,60 2018-03,57 2018-04,50 2018-05,51 2018-06,49 2018-07,47 2018-08,47 2018-09,49 2018-10,53 2018-11,48 2018-12,50 2019-01,51 2019-02,55 2019-03,50 2019-04,48 2019-05,46 2019-06,44 2019-07,43 2019-08,44 2019-09,45 2019-10,57 2019-11,45 2019-12,44 2020-01,43 2020-02,51 2020-03,46 2020-04,58 2020-05,60 2020-06,53 2020-07,50 2020-08,47 2020-09,44 2020-10,49 2020-11,44 2020-12,43 -------------------------------------------------------------------------------- /Preprocessing/SQLDF/.ipynb_checkpoints/SQL DF-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /Preprocessing/SpeedUp/PySpark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Generate DataFrame" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 
16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "data = []\n", 20 | "bins = np.arange(1000, 500_000, step=10_000)\n", 21 | "for N in bins:\n", 22 | " for i in range(0, N):\n", 23 | " data.append({'index': i, 'value': i})\n", 24 | " df = pd.DataFrame(data)\n", 25 | " df.to_csv(str(N) + '.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Pandas" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import time\n", 42 | "\n", 43 | "pandas_results = []\n", 44 | "\n", 45 | "for i in bins:\n", 46 | " s = time.time()\n", 47 | " df = pd.read_csv(str(i) + '.csv')\n", 48 | " e = time.time()\n", 49 | " pandas_time = e - s \n", 50 | " pandas_results.append({'time' : pandas_time, 'bin' : i})" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# PySpark" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import findspark\n", 67 | "#import all the libraries of pyspark.sql\n", 68 | "from pyspark.sql import *\n", 69 | "#import SparkContext and SparkConf\n", 70 | "from pyspark import SparkContext, SparkConf\n", 71 | "\n", 72 | "#setup configuration property \n", 73 | "#set the master URL \n", 74 | "#set an application name \n", 75 | "conf = SparkConf().setMaster(\"local\").setAppName(\"sparkproject\")\n", 76 | "#start spark cluster \n", 77 | "#if already started then get it else start it \n", 78 | "sc = SparkContext.getOrCreate(conf=conf)\n", 79 | "\n", 80 | "spark = SparkSession.builder.getOrCreate()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "pyspark_results = []\n", 90 | "\n", 91 | "for i in bins:\n", 92 | " s = time.time()\n", 93 | " df = spark.read.csv(str(i) + '.csv',inferSchema 
=True,header=True)\n", 94 | " e = time.time()\n", 95 | " pyspark_time = e - s \n", 96 | " pyspark_results.append({'time' : pandas_time, 'bin' : i})" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "sc.stop()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "df_spark = pd.DataFrame(pyspark_results)\n", 115 | "df_pandas = pd.DataFrame(pandas_results)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "import matplotlib.pyplot as plt\n", 125 | "\n", 126 | "plt.plot(df_spark['bin'], df_spark['time'], label='pandas')\n", 127 | "plt.plot(df_pandas['bin'], df_pandas['time'], label='pyspark')\n", 128 | "plt.legend()\n", 129 | "plt.grid()\n", 130 | "plt.xlabel('N')\n", 131 | "plt.ylabel('time (sec.)')\n", 132 | "plt.title('Time elapsed to load a dataset with N records')\n", 133 | "plt.show()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "osiris_fo", 147 | "language": "python", 148 | "name": "osiris_fo" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.8.1" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 4 165 | } 166 | -------------------------------------------------------------------------------- /Preprocessing/Standardization/.ipynb_checkpoints/Data Preprocessing - Standardization-checkpoint.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![Logo](https://alod83online.files.wordpress.com/2021/07/logo.jpg) 3 | 4 | 5 | # Data Science for Beginners 6 | 7 | A collection of Jupyter Notebooks, HTML and JS code for Data Scientists. 8 | 9 | Comments on the individual scripts can be found on my [Medium blog](https://alod83.medium.com/) as well as on my [Web site](https://alod83.altervista.org/). 10 | 11 | ## Programming Languages 12 | * Python 3.X 13 | * HTML5 14 | * Javascript, with a focus on D3.JS 15 | * CSS 16 | 17 | Other interesting tutorials can be found on [my Observable Profile](https://observablehq.com/@alod83). 18 | ## Authors 19 | 20 | - [@alod83](https://www.github.com/alod83) 21 | 22 | 23 | ## Documentation 24 | 25 | The Project is organised in different folders, one for each covered topic: 26 | * **Data Collection** - data extraction from HTML, Twitter, PDF ... 27 | * **Preprocessing** - missing data, duplicates, normalisation, binning ... 28 | * **Data Analysis** - full workflow in scikit-learn and PyCaret, as well as overfitting, Auto ML, ... 29 | * **Text Analysis** - sentiment analysis, ... 30 | * **Data Visualisation** - examples in Altair, Plotly, D3.js, ... 31 | * **Data Narrative** - how to improve data visualisations.
32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /Tests/Calculator/calculator.py: -------------------------------------------------------------------------------- 1 | def add(a, b): 2 | return a + b 3 | 4 | def subtract(a, b): 5 | return a - b 6 | 7 | def multiply(a, b): 8 | return a * b 9 | 10 | def divide(a, b): 11 | return a / b -------------------------------------------------------------------------------- /Tests/Calculator/test_calculator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from calculator import * 3 | 4 | def test_addition(): 5 | result = add(2, 4) 6 | assert result == 6 7 | 8 | result = add(-1, 3) 9 | assert result == 2 10 | 11 | result = add(0, 0) 12 | assert result == 0 13 | 14 | def test_addition(): 15 | result = add(2, 3) 16 | assert result == 5 17 | 18 | result = add(-1, 5) 19 | assert result == 4 20 | 21 | result = add(0, 0) 22 | assert result == 0 23 | 24 | def test_subtraction(): 25 | result = subtract(5, 2) 26 | assert result == 3 27 | 28 | result = subtract(10, -3) 29 | assert result == 13 30 | 31 | result = subtract(0, 0) 32 | assert result == 0 33 | 34 | def test_multiplication(): 35 | result = multiply(2, 3) 36 | assert result == 6 37 | 38 | result = multiply(-4, 5) 39 | assert result == -20 40 | 41 | result = multiply(0, 100) 42 | assert result == 0 43 | 44 | def test_division(): 45 | result = divide(10, 2) 46 | assert result == 5 47 | 48 | result = divide(25, -5) 49 | assert result == -5 50 | 51 | with pytest.raises(ZeroDivisionError): 52 | divide(10, 0) 53 | -------------------------------------------------------------------------------- /Tests/CalculatorFixture/calculator.py: -------------------------------------------------------------------------------- 1 | class Calculator: 2 | def __init__(self): 3 | self.result = 0 4 | 5 | def add(self, a, b): 6 | self.result = a + b 7 | return self.result 8 | 9 | def 
subtract(self, a, b): 10 | self.result = a - b 11 | return self.result 12 | 13 | def multiply(self, a, b): 14 | self.result = a * b 15 | return self.result 16 | 17 | def divide(self, a, b): 18 | self.result = a / b 19 | return self.result 20 | -------------------------------------------------------------------------------- /Tests/CalculatorFixture/test_calculator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from calculator import * 3 | 4 | @pytest.fixture 5 | def calculator(): 6 | # Create an instance of the calculator 7 | calc = Calculator() 8 | return calc 9 | 10 | def test_addition(calculator): 11 | result = calculator.add(2, 3) 12 | assert result == 5 13 | 14 | def test_subtraction(calculator): 15 | result = calculator.subtract(5, 2) 16 | assert result == 3 17 | 18 | def test_multiplication(calculator): 19 | result = calculator.multiply(4, 3) 20 | assert result == 12 21 | 22 | def test_division(calculator): 23 | result = calculator.divide(10, 2) 24 | assert result == 5 -------------------------------------------------------------------------------- /TextAnalysis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/TextAnalysis/.DS_Store -------------------------------------------------------------------------------- /TextAnalysis/.ipynb_checkpoints/Spark NLP-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /TextAnalysis/Render Original Layout of Text Document after Manipulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "# How to Render the Original Layout of a Text Document After a Manipulation\n", 8 | "\n", 9 | "## Document Manipulation\n", 10 | "Anonymise dates" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 229, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import re\n", 20 | "\n", 21 | "def anonymiseDate(txt):\n", 22 | " dateRE=\"(\\d{1,2}[\\.-/]\\d{1,2}[\\.-/]\\d{4})|(\\d{4})\"\n", 23 | " matches=re.findall(dateRE,txt)\n", 24 | " if matches:\n", 25 | " for match in matches:\n", 26 | " index = 0\n", 27 | " if match[index] == '':\n", 28 | " index = 1\n", 29 | " txt = txt.replace(match[index], 'X')\n", 30 | " return txt" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Open the document with extension .docx" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 246, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from docx import Document\n", 47 | "\n", 48 | "doc = Document('source/WilliamShakespeare.docx')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Anonymise the text, paragraph by paragraph and run by run" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 241, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "for ip in doc.paragraphs:\n", 65 | " for run in ip.runs:\n", 66 | " anonymised_txt = anonymiseDate(run.text)\n", 67 | " run.text = anonymised_txt " 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# Save the document" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 242, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "doc.save('Anonymised.docx')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "# Open file with different extension" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 261, 96 | 
"metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "Process is interrupted.\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "%%bash\n", 108 | "\n", 109 | "/Applications/OpenOffice.app/Contents/MacOS/soffice --headless --convert-to 'docx' -o WilliamShakespeare_rtf.docx \"source/WilliamShakespeare_rtf.rtf\"\n", 110 | "\n", 111 | "#soffice = '/Applications/OpenOffice.app/Contents/MacOS/soffice'\n", 112 | "#subprocess.call([soffice_path, '--headless', '--convert-to', 'docx', 'source/WilliamShakespeare_rtf.rtf'])" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.8.1" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 4 144 | } 145 | -------------------------------------------------------------------------------- /TextAnalysis/register.txt: -------------------------------------------------------------------------------- 1 | On August 21 1826 a son was born to John Bon and named him Francis. 2 | On June 11 1813 a daughter was born to James Donne naming her Mary Sarah. 3 | On January 1 1832 a son was born to his father David Borne and named him John. 
-------------------------------------------------------------------------------- /TextAnalysis/source/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/TextAnalysis/source/.DS_Store -------------------------------------------------------------------------------- /TextAnalysis/source/WilliamShakespeare.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alod83/data-science/84fdc7eac9ebd365c8d0ca7583923d25a16b6905/TextAnalysis/source/WilliamShakespeare.docx -------------------------------------------------------------------------------- /TextAnalysis/string-similarity/jaro_similarity.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.simplefilter(action='ignore', category=FutureWarning) 3 | 4 | import pandas as pd 5 | from jellyfish import jaro_similarity 6 | 7 | # load the cognomi.csv file as a dataframe 8 | df = pd.read_csv('surnames.csv') 9 | 10 | similarity_df = pd.DataFrame(columns=['surname1', 'surname2', 'jaro_similarity']) 11 | 12 | for i in range(len(df)): 13 | surname1 = df['surname'][i] 14 | print(surname1) 15 | 16 | for j in range(i+1, len(df)): 17 | surname2 = df['surname'][j] 18 | jaro_sim = jaro_similarity(surname1, surname2) 19 | similarity_df = similarity_df.append({'surname1': surname1, 'surname2': surname2, 'jaro_similarity' : jaro_sim}, ignore_index=True) 20 | 21 | # save the new dataframe to a file 22 | similarity_df.to_csv('jaro_similarity.csv', index=False) 23 | 24 | candidates_jaro_df = similarity_df[similarity_df['jaro_similarity'] >= 0.9].reset_index(drop=True) 25 | candidates_jaro_df.to_csv('candidates_jaro.csv', index=False) -------------------------------------------------------------------------------- /TextAnalysis/string-similarity/jaro_soundex_similarity.py: 
-------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.simplefilter(action='ignore', category=FutureWarning) 3 | 4 | import pandas as pd 5 | from jellyfish import soundex, jaro_similarity 6 | 7 | # load the cognomi.csv file as a dataframe 8 | df = pd.read_csv('surnames.csv') 9 | 10 | similarity_df = pd.DataFrame(columns=['surname1', 'surname2', 'jaro_soundex_similarity']) 11 | 12 | for i in range(len(df)): 13 | surname1 = df['surname'][i] 14 | print(surname1) 15 | 16 | for j in range(i+1, len(df)): 17 | surname2 = df['surname'][j] 18 | jaro_sim = jaro_similarity(soundex(surname1), soundex(surname2)) 19 | similarity_df = similarity_df.append({'surname1': surname1, 'surname2': surname2, 'jaro_soundex_similarity' : jaro_sim}, ignore_index=True) 20 | 21 | # save the new dataframe to a file 22 | similarity_df.to_csv('jaro_soundex_similarity.csv', index=False) 23 | 24 | candidates_jaro_df = similarity_df[similarity_df['jaro_soundex_similarity'] >= 0.9].reset_index(drop=True) 25 | candidates_jaro_df.to_csv('candidates_jaro_soundex.csv', index=False) -------------------------------------------------------------------------------- /TextAnalysis/string-similarity/leven_similarity.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.simplefilter(action='ignore', category=FutureWarning) 3 | 4 | import pandas as pd 5 | from jellyfish import levenshtein_distance 6 | 7 | # load the cognomi.csv file as a dataframe 8 | df = pd.read_csv('surnames.csv') 9 | 10 | similarity_df = pd.DataFrame(columns=['surname1', 'surname2', 'levenshtein_distance']) 11 | 12 | for i in range(len(df)): 13 | surname1 = df['surname'][i] 14 | print(surname1) 15 | 16 | for j in range(i+1, len(df)): 17 | surname2 = df['surname'][j] 18 | leven_distance = levenshtein_distance(surname1, surname2) 19 | similarity_df = similarity_df.append({'surname1': surname1, 'surname2': surname2, 
'levenshtein_distance' : leven_distance}, ignore_index=True) 20 | 21 | # save the new dataframe to a file 22 | similarity_df.to_csv('leven_similarity.csv', index=False) 23 | 24 | candidates_levenshtein_df = similarity_df[similarity_df['levenshtein_distance'] <= 1].reset_index(drop=True) 25 | candidates_levenshtein_df.to_csv('candidates_levenshtein.csv', index=False) -------------------------------------------------------------------------------- /TextAnalysis/string-similarity/surnames.csv: -------------------------------------------------------------------------------- 1 | surname 2 | Smith 3 | Johnson 4 | Williams 5 | Brown 6 | Jones 7 | Garcia 8 | Davis 9 | Rodriguez 10 | Martinez 11 | Taylor 12 | Andersson 13 | White 14 | Wilson 15 | Thomas 16 | Lee 17 | Hall 18 | Harris 19 | Perez 20 | Jackson 21 | Martin 22 | Thompson 23 | Murphy 24 | Turner 25 | Baker 26 | Kim 27 | Johnsonn 28 | Daviis 29 | Rodriguuez 30 | Martinezz 31 | Tayylor --------------------------------------------------------------------------------