├── Coronavirus.pbix ├── neo4j_recommender ├── users_minimal_test.dat ├── ratings_minimal_test.dat └── movies_minimal_test.dat ├── Arxiv_graph ├── arxiv_df.pkl ├── arxiv_math_20220301.xlsx ├── graph-dbms-neo4j-Mar-18-2022-17-13-25.dump ├── README.md ├── display_graph.html └── Scrap Arxiv and insert to Neo4j v1.ipynb ├── online_retail ├── GDP.xlsx ├── customer_segments_RFM_country.pickle ├── customer_segments_buying_categories.pickle └── Online_retail_Combine_Segmentations.ipynb ├── topological_data_analysis └── README.md ├── clustering_example └── Outlier_example.R ├── README.md └── Markov_text_generation.ipynb /Coronavirus.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/Coronavirus.pbix -------------------------------------------------------------------------------- /neo4j_recommender/users_minimal_test.dat: -------------------------------------------------------------------------------- 1 | 1::F::1::10::48067 2 | 2::M::56::16::70072 3 | 3::M::25::15::55117 -------------------------------------------------------------------------------- /Arxiv_graph/arxiv_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/Arxiv_graph/arxiv_df.pkl -------------------------------------------------------------------------------- /online_retail/GDP.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/GDP.xlsx -------------------------------------------------------------------------------- /Arxiv_graph/arxiv_math_20220301.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/Arxiv_graph/arxiv_math_20220301.xlsx 
-------------------------------------------------------------------------------- /topological_data_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Topological Data Analysis 2 | 3 | This folder contains code with examples of topological data analysis. 4 | -------------------------------------------------------------------------------- /online_retail/customer_segments_RFM_country.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/customer_segments_RFM_country.pickle -------------------------------------------------------------------------------- /Arxiv_graph/graph-dbms-neo4j-Mar-18-2022-17-13-25.dump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/Arxiv_graph/graph-dbms-neo4j-Mar-18-2022-17-13-25.dump -------------------------------------------------------------------------------- /online_retail/customer_segments_buying_categories.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/customer_segments_buying_categories.pickle -------------------------------------------------------------------------------- /neo4j_recommender/ratings_minimal_test.dat: -------------------------------------------------------------------------------- 1 | 1::1::5::978300760 2 | 1::2::5::978302109 3 | 1::4::3::978301968 4 | 2::1::5::978300760 5 | 2::2::5::978302109 6 | 3::1::4::978302109 7 | 3::10::5::978302109 -------------------------------------------------------------------------------- /neo4j_recommender/movies_minimal_test.dat: -------------------------------------------------------------------------------- 1 | 1::Toy Story (1995)::Animation|Children's|Comedy 2 | 2::Jumanji 
### Preamble ####
# Demonstrates how a single outlier changes the result of k-means:
# two Gaussian clusters are simulated, clustered with k = 2, and then
# clustered again after one far-away point has been appended.

# Loading libraries
library(ggplot2)
library(cowplot)

# Setting random generator seed so the simulated points are reproducible
set.seed(42)

# Number of simulated points per cluster. The label vectors below are built
# from this value; previously they were rep(., 10) against 20 points and only
# matched the coordinates through data.frame() recycling.
n_points <- 20

# Scatter plot of `data` (columns x, y, c): colour encodes the true label `c`,
# point shape encodes the k-means assignment given in `cluster`.
plot_clusters <- function(data, cluster) {
  ggplot(data) +
    aes(x = x, y = y, color = c, shape = as.factor(cluster)) +
    geom_point() +
    theme(legend.position = "none") +
    scale_color_gradient(low = "blue", high = "red")
}

### No Outlier ####
# First cluster - cluster 0 (centred at (0, 0), sd 0.5)
x1 <- rnorm(n_points, 0, 0.5)
y1 <- rnorm(n_points, 0, 0.5)

# Second cluster - cluster 1 (centred at (2, 2), sd 0.5)
x2 <- rnorm(n_points, 2, 0.5)
y2 <- rnorm(n_points, 2, 0.5)

# Dataset creation: column c holds the true cluster label
d1 <- data.frame(x = x1, y = y1, c = rep(0, n_points))
d2 <- data.frame(x = x2, y = y2, c = rep(1, n_points))
d <- rbind(d1, d2)

# k-means on the coordinates only (third column, the label, is dropped)
d_clst <- kmeans(d[, -3], 2)

# Plot
g1 <- plot_clusters(d, d_clst$cluster)
g1

### Outlier ####

# Data: same points plus one outlier at (4, 4) labelled as cluster 1
out <- data.frame(x = c(4), y = c(4), c = c(1))
d_out <- rbind(d, out)

# k-means
d_out_clst <- kmeans(d_out[, -3], 2)

# Plot
g2 <- plot_clusters(d_out, d_out_clst$cluster)
g2

# Combining two graphs
plot_grid(g1, g2, labels = c("No outlier", "Outlier"), ncol = 2, nrow = 1)
analysis with neo4j 2 | 3 | This folder contains Python code for: 4 | - extracting (scraping) metadata from arXiv, 5 | - importing it into a neo4j graph database, 6 | - performing analysis on it. 7 | 8 | In particular, 9 | * [Scrap Arxiv and insert to Neo4j v1.ipynb](https://github.com/dpanagop/data_analytics_examples/blob/master/Arxiv_graph/Scrap%20Arxiv%20and%20insert%20to%20Neo4j%20v1.ipynb) is a jupyter notebook that extracts (scrapes) data from arXiv and imports it into a neo4j graph database. The extracted data are also stored in an excel file [arxiv_math_20220301.xlsx](https://github.com/dpanagop/data_analytics_examples/blob/master/Arxiv_graph/arxiv_math_20220301.xlsx) and in a pickle file [arxiv_df.pkl](https://github.com/dpanagop/data_analytics_examples/blob/master/Arxiv_graph/arxiv_df.pkl). 10 | * [Embedding v2.ipynb](https://github.com/dpanagop/data_analytics_examples/blob/master/Arxiv_graph/Embedding%20v2.ipynb) does the analysis. The main highlights are creating a node embedding with node2vec and using the result with k-means and UMAP. In [Embedding v2-Stress.ipynb](https://github.com/dpanagop/data_analytics_examples/blob/master/Arxiv_graph/Embedding%20v2-Stress.ipynb) you can see the results of the same analysis for a dataset of around 10k articles. 11 | * [display_graph.html](https://github.com/dpanagop/data_analytics_examples/blob/master/Arxiv_graph/display_graph.html) uses neovis.js to produce a graph that depicts the relationship between different Mathematics subject classifications. 12 | * [graph-dbms-neo4j-Mar-18-2022-17-13-25.dump](graph-dbms-neo4j-Mar-18-2022-17-13-25.dump) is a dump of the neo4j database. 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data_analytics_examples 2 | 3 | This repository contains code that is used for various data analysis/data science projects. 
In most cases, the code is related to a Medium post. 4 | 5 | - [online_retail](https://github.com/dpanagop/data_analytics_examples/tree/master/online_retail) has code related to 6 | an example of segmentation of wholesale customers. It is split into two parts. The [first part](https://towardsdatascience.com/customer-segmentation-part-i-2c5e2145e719) uses natural language processing 7 | to cluster clients based on their transactions. The [second part](https://towardsdatascience.com/customer-segmentation-part-ii-1c94bdc03de5) creates an RFM segmentation which is combined with the results of the first part. 8 | 9 | - [Coronavirus.pbix](https://github.com/dpanagop/data_analytics_examples/blob/master/Coronavirus.pbix) is a Power BI dashboard about COVID-19. You can read more in the related 10 | [Medium post](https://dpanagop-53386.medium.com/covid-19-dashboard-with-power-bi-78caf8d16856?source=your_stories_page-------------------------------------). 11 | 12 | - [Markov_text_generation.ipynb](https://github.com/dpanagop/data_analytics_examples/blob/master/Markov_text_generation.ipynb) is a jupyter notebook that uses Markov chains to produce text. You can read more in 13 | the related [Medium article](https://towardsdatascience.com/using-a-transition-matrix-to-generate-text-in-python-c5e78495b09b?source=your_stories_page-------------------------------------). 
14 | 15 | - [UMAP](https://github.com/dpanagop/data_analytics_examples/blob/master/UMAP.ipynb) is an example of Uniform Manifold Approximation and Projection (UMAP) (for details see the [UMAP](https://umap-learn.readthedocs.io/en/latest/) algorithm documentation) 16 | 17 | - [Arxiv_graph](https://github.com/dpanagop/data_analytics_examples/tree/master/Arxiv_graph) arXiv metadata analysis with neo4j 18 | 19 | - [neo4j_recommender](https://github.com/dpanagop/data_analytics_examples/tree/master/neo4j_recommender) creation of movie recommenders with Neo4j 20 | -------------------------------------------------------------------------------- /Arxiv_graph/display_graph.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | DataViz 4 | 10 | 11 | 12 | 26 | 27 | 41 | 42 | 67 | 68 |
69 | 70 | " -------------------------------------------------------------------------------- /Arxiv_graph/Scrap Arxiv and insert to Neo4j v1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Importing necessary libraries\n", 10 | "import requests\n", 11 | "import pandas as pd\n", 12 | "from bs4 import BeautifulSoup\n", 13 | "import re" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 190, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Queries\n", 23 | "# Fetch 50 articles in Mathematics from 2022-03-01 to 2022-03-07\n", 24 | "# query = 'https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-mathematics=y&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2022-03-01&date-to_date=2022-03-07&date-date_type=submitted_date&abstracts=show&order=-announced_date_first'\n", 25 | "# Fetch 200 (this is the maximum) articles in Mathematics from 2022-03-01 to 2022-03-07\n", 26 | "query = 'https://arxiv.org/search/advanced?advanced=1&terms-0-term=&terms-0-operator=AND&terms-0-field=title&classification-mathematics=y&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=date_range&date-year=&date-from_date=2022-03-01&date-to_date=2022-03-07&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first'\n", 27 | "# Fetch next 200 articles in Mathematics from 2022-03-01 to 2022-03-01 (same date)\n", 28 | "# query = 
# Download the arXiv search-results page for `query` and parse its HTML.
page = requests.get(query)
# Fail loudly on an HTTP error instead of silently parsing an error page
# into an empty table.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# One entry per article, kept as parallel lists because later cells zip
# them together to build the neo4j import payload.
arxiv_id_list = []
title_list = []
authors_list = []
classification_list = []
classification_detailed_list = []

for article in soup.find_all('li', {'class': "arxiv-result"}):
    # arXiv identifier: text of the first link in the list-title paragraph.
    arxiv_id = article.find_all('p', {'class': "list-title is-inline-block"})
    arxiv_id = arxiv_id[0].find_all('a', href=True)[0].get_text()

    title = article.find_all('p', {'class': "title is-5 mathjax"})
    title = title[0].get_text().strip()

    authors = article.find_all('p', {'class': "authors"})
    authors = [x.get_text().strip() for x in authors[0].find_all('a')]

    # Primary classification is read from the "is-link" tag; an optional
    # secondary classification is read from the first "is-grey" tag.
    classification_primary = article.find_all(
        'span', {'class': 'tag is-small is-link tooltip is-tooltip-top'}
    )[0].attrs.get('data-tooltip')
    classification_secondary = article.find_all(
        'span', {'class': 'tag is-small is-grey tooltip is-tooltip-top'}
    )
    if len(classification_secondary) > 0:
        classification_secondary = classification_secondary[0].attrs.get('data-tooltip')
        classification = [classification_primary, classification_secondary]
    else:
        classification = [classification_primary]

    # Detailed (MSC) classification. It is reset for every article: before
    # this fix an article without a comments paragraph kept the codes of the
    # previous article (or raised NameError if it was the first one).
    classification_detailed = ['N/A']
    comments = article.find_all('p', {'class': "comments is-size-7"})
    if len(comments) > 0:
        comments = comments[0].get_text()
        msc_match = re.search('MSC Class:\n(.*)\n', comments)
        if msc_match is not None:
            # Codes are ';'-separated; strip each one so that no code carries
            # leading or trailing whitespace.
            classification_detailed = [
                code.strip() for code in msc_match.group(1).split(sep=';')
            ]

    arxiv_id_list.append(arxiv_id)
    title_list.append(title)
    authors_list.append(authors)
    classification_list.append(classification)
    classification_detailed_list.append(classification_detailed)

# Tabular view of the scraped articles.
# NOTE(review): the column is spelled 'airxv_id' here while the neo4j payload
# built in a later cell uses 'arxiv_id'. The spelling is kept unchanged because
# files already saved from this frame may rely on it - confirm before renaming.
df = pd.DataFrame({'airxv_id': arxiv_id_list,
                   'title': title_list,
                   'authors': authors_list,
                   'classification': classification_list,
                   'classification_detailed': classification_detailed_list})
\n", 111 | "\n", 124 | "\n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
airxv_idtitleauthorsclassificationclassification_detailed
0arXiv:2203.07873Asymptotic Fermat for signatures $(p,p,2)$ and...[Diana Mocanu][Number Theory][11F80, 11G05, 11D41]
1arXiv:2203.05018Application of neural-network hybrid models in...[Chentong Li, Zhou Changsheng, Junmin Liu, Yao...[Dynamical Systems, Physics and Society][11F80, 11G05, 11D41]
2arXiv:2203.05017Asymmetric Duffing oscillator: jump manifold a...[Jan Kyzioł, Andrzej Okniński][Dynamical Systems][N/A]
3arXiv:2203.04148Numerical solution of optimal control of ather...[F. Nasresfahani, M. R. Eslahchi][Optimization and Control, Numerical Analysis][N/A]
4arXiv:2203.04145The structure of the linearizer of a connected...[Oleg Aristov][Group Theory][N/A]
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " airxv_id title \\\n", 182 | "0 arXiv:2203.07873 Asymptotic Fermat for signatures $(p,p,2)$ and... \n", 183 | "1 arXiv:2203.05018 Application of neural-network hybrid models in... \n", 184 | "2 arXiv:2203.05017 Asymmetric Duffing oscillator: jump manifold a... \n", 185 | "3 arXiv:2203.04148 Numerical solution of optimal control of ather... \n", 186 | "4 arXiv:2203.04145 The structure of the linearizer of a connected... \n", 187 | "\n", 188 | " authors \\\n", 189 | "0 [Diana Mocanu] \n", 190 | "1 [Chentong Li, Zhou Changsheng, Junmin Liu, Yao... \n", 191 | "2 [Jan Kyzioł, Andrzej Okniński] \n", 192 | "3 [F. Nasresfahani, M. R. Eslahchi] \n", 193 | "4 [Oleg Aristov] \n", 194 | "\n", 195 | " classification classification_detailed \n", 196 | "0 [Number Theory] [11F80, 11G05, 11D41] \n", 197 | "1 [Dynamical Systems, Physics and Society] [11F80, 11G05, 11D41] \n", 198 | "2 [Dynamical Systems] [N/A] \n", 199 | "3 [Optimization and Control, Numerical Analysis] [N/A] \n", 200 | "4 [Group Theory] [N/A] " 201 | ] 202 | }, 203 | "execution_count": 251, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df.head()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 195, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "200" 221 | ] 222 | }, 223 | "execution_count": 195, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "len(df.classification)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 196, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "neo4j_data=[]\n", 239 | "for arxiv_id,title,author,classification in zip(arxiv_id_list,title_list,authors_list,classification_list):\n", 240 | " neo4j_data.append({'arxiv_id':arxiv_id,'title':title,'author':author,'classification':classification})" 241 | ] 242 | }, 243 | { 244 | 
def run_query(query, params=None):
    """Run a Cypher query and return its records as a pandas DataFrame.

    Args:
        query: Cypher statement passed to ``session.run``.
        params: Optional dict of query parameters. ``None`` (the default)
            is sent as an empty dict; a fresh dict is created per call so
            no mutable default is shared between calls.

    Returns:
        A DataFrame with one row per returned record and one column per
        key reported by the result.
    """
    if params is None:
        params = {}
    # The module-level `driver` supplies the session; the result is fully
    # consumed inside the `with` block, before the session is closed.
    with driver.session() as session:
        result = session.run(query, params)
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())
\n", 277 | "\n", 290 | "\n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | "
\n", 299 | "
" 300 | ], 301 | "text/plain": [ 302 | "Empty DataFrame\n", 303 | "Columns: []\n", 304 | "Index: []" 305 | ] 306 | }, 307 | "execution_count": 199, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "run_query(\"CREATE CONSTRAINT IF NOT EXISTS ON (a:Article) ASSERT a.arxiv_id IS UNIQUE;\")\n", 314 | "run_query(\"CREATE CONSTRAINT IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE;\")\n", 315 | "run_query(\"CREATE CONSTRAINT IF NOT EXISTS ON (a:classification) ASSERT a.name IS UNIQUE;\")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 200, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "import_pubmed_query = \"\"\"\n", 325 | "UNWIND $data AS row\n", 326 | "// Store article\n", 327 | "MERGE (a:Article {arxiv_id: row.arxiv_id})\n", 328 | "SET a.title = row.title\n", 329 | "// Store authors \n", 330 | "FOREACH (author IN row.author |\n", 331 | " MERGE (au:Author {name: author})\n", 332 | " MERGE (a)<-[:AUTHORED]-(au))\n", 333 | "// Store classifications\n", 334 | "FOREACH (class IN row.classification |\n", 335 | " MERGE (cl:classification {name: class})\n", 336 | " MERGE (a)-[:BELONGS]->(cl))\n", 337 | "\"\"\"" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 201, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "result = run_query(import_pubmed_query, {'data': neo4j_data})" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 202, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "{'arxiv_id': 'arXiv:2203.07873',\n", 358 | " 'title': 'Asymptotic Fermat for signatures $(p,p,2)$ and $(p,p,3)$ over totally real fields',\n", 359 | " 'author': ['Diana Mocanu'],\n", 360 | " 'classification': ['11F80', ' 11G05', ' 11D41']}" 361 | ] 362 | }, 363 | "execution_count": 202, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 
369 | "neo4j_data[0]" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 246, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "ename": "IndexError", 379 | "evalue": "list index out of range", 380 | "output_type": "error", 381 | "traceback": [ 382 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 383 | "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", 384 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mclassification_secondary\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marticle\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'span'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'class'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;34m'tag is-small is-blue tooltip is-tooltip-top'\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mattrs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'data-tooltip'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 385 | "\u001b[1;31mIndexError\u001b[0m: list index out of range" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | " classification_secondary = article.find_all('span',{'class':'tag is-small is-blue tooltip is-tooltip-top'})[0].attrs.get('data-tooltip')" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 245, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "13" 402 | ] 403 | }, 404 | "execution_count": 245, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "len(classification_secondary)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | 
"source": [] 419 | } 420 | ], 421 | "metadata": { 422 | "kernelspec": { 423 | "display_name": "Python 3", 424 | "language": "python", 425 | "name": "python3" 426 | }, 427 | "language_info": { 428 | "codemirror_mode": { 429 | "name": "ipython", 430 | "version": 3 431 | }, 432 | "file_extension": ".py", 433 | "mimetype": "text/x-python", 434 | "name": "python", 435 | "nbconvert_exporter": "python", 436 | "pygments_lexer": "ipython3", 437 | "version": "3.7.4" 438 | }, 439 | "varInspector": { 440 | "cols": { 441 | "lenName": 16, 442 | "lenType": 16, 443 | "lenVar": 40 444 | }, 445 | "kernels_config": { 446 | "python": { 447 | "delete_cmd_postfix": "", 448 | "delete_cmd_prefix": "del ", 449 | "library": "var_list.py", 450 | "varRefreshCmd": "print(var_dic_list())" 451 | }, 452 | "r": { 453 | "delete_cmd_postfix": ") ", 454 | "delete_cmd_prefix": "rm(", 455 | "library": "var_list.r", 456 | "varRefreshCmd": "cat(var_dic_list()) " 457 | } 458 | }, 459 | "types_to_exclude": [ 460 | "module", 461 | "function", 462 | "builtin_function_or_method", 463 | "instance", 464 | "_Feature" 465 | ], 466 | "window_display": false 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 2 471 | } 472 | -------------------------------------------------------------------------------- /Markov_text_generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Markov text generation.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyPlJht3NIGMzXO1rl324sIR", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 
31 | "id": "_p8TtM2IAeYR" 32 | }, 33 | "source": [ 34 | "# Text generation using Markov chains\n", 35 | "\n", 36 | "This is a short notebook that demonstrates how to generate text using a transition matrix." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "LDVxdQgOz6_8" 43 | }, 44 | "source": [ 45 | "# Import libraries\n", 46 | "import re\n", 47 | "from random import random, sample" 48 | ], 49 | "execution_count": null, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "id": "aAeVkev_Cuob" 56 | }, 57 | "source": [ 58 | "The transition matrix is defined in a class.\n", 59 | "Actually, we are going to use a dictionary to create what is called a sparse matrix. For example the table" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "pEtfPI7xHAnA" 66 | }, 67 | "source": [ 68 | "| | (am,a) | (am,happy) | (am,it) | (nice,thing)|\n", 69 | "| --- | --- | --- | --- | --- |\n", 70 | "| (i,am) | 1 | 2 | 0 | 0 |\n", 71 | "| (a,nice) | 0 | 0 | 0 | 3|" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "id": "cnC4moEwHvY4" 78 | }, 79 | "source": [ 80 | "will be represented by the dictionary:\n", 81 | "\n", 82 | "```{ \"i,am\": {\"a\":1, \"happy\":2} \n", 83 | " , \"a,nice\": {\"thing\":3}\n", 84 | "}```\n", 85 | "\n", 86 | "Note that \n", 87 | "- we are not adding entries for cells with zero value\n", 88 | "- for the keys of the columns we are using only the next word \n", 89 | "\n", 90 | "PS. The careful reader will detect that we are not using probabilities in the matrix but instead a count. We will use these counts to calculate the corresponding probabilities." 
class TransitionMatrix:
    """Sparse transition matrix from a pair of words to the word that follows.

    Attributes:
        SparseMatrix: dict whose keys are the strings "word_1,word_2" and
            whose values are dicts {next_word: count}, where count is how
            many times next_word was recorded after that pair.

    Methods:
        add_tripple: record one occurrence of the triple word_1, word_2, word_3.
        next_word: given word_1, word_2, draw a next word with probability
            proportional to the counts stored for that pair.
    """

    # Words next_word chooses from when the pair has never been recorded.
    DEFAULT_FALLBACK = (".", ",", "and")

    def __init__(self):
        self.SparseMatrix = {}

    @staticmethod
    def _key(word1, word2):
        """Build the SparseMatrix key for the pair word1, word2."""
        return word1 + "," + word2

    def add_tripple(self, word1, word2, word3):
        """Record that word3 followed the pair word1, word2.

        If the pair is not yet a key it is added with {word3: 1};
        otherwise the count stored for word3 is created at 1 or
        incremented by one.
        """
        followers = self.SparseMatrix.setdefault(self._key(word1, word2), {})
        followers[word3] = followers.get(word3, 0) + 1

    def next_word(self, word1, word2, fallback=DEFAULT_FALLBACK):
        """Generate the next word for the pair word1, word2.

        Args:
            word1, word2: the two preceding words.
            fallback: sequence of words to choose from uniformly when the
                pair has no recorded followers. Defaults to
                [".", ",", "and"], the list that was previously hard-coded.

        Returns:
            A word drawn at random, weighted by the recorded counts, or a
            uniformly chosen element of `fallback` for an unknown pair.

        See https://stackoverflow.com/questions/2570690/python-algorithm-to-randomly-select-a-key-based-on-proportionality-weight
        """
        followers = self.SparseMatrix.get(self._key(word1, word2))
        if not followers:
            # Unknown pair (or a pair with an empty follower dict, which
            # previously fell through and returned None).
            return sample(list(fallback), 1)[0]
        # Draw a point in [0, total count) and walk the cumulative counts
        # until it is covered; each word owns a slice equal to its count.
        threshold = sum(followers.values()) * random()
        running = 0
        for word, count in followers.items():
            running += count
            if threshold <= running:
                return word
        # Not reached with the integer counts add_tripple stores; kept so the
        # method never returns None.
        return word
def preprocess_sentence(sentence):
    """Normalise a line of text before it is split into words.

    Steps, in order:
      1. convert to lowercase;
      2. delete every character that is not a word character, whitespace,
         or one of . ! ?
      3. surround each punctuation mark with spaces so it becomes its own
         token when the result is split on whitespace;
      4. collapse runs of two or more whitespace characters into one space.

    Args:
        sentence: the input string.

    Returns:
        The normalised string. It may start or end with a single space.
    """
    sentence = sentence.lower()
    # \w already covers digits, so the former explicit \d was redundant.
    sentence = re.sub(r"[^\w.!?\s]+", '', sentence)
    # NOTE(review): commas are deleted by the substitution above, so the ','
    # in this pattern can never match; kept so the pattern is unchanged.
    sentence = re.sub(r'([.,!?])', r' \1 ', sentence)
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    sentence = re.sub(r'\s{2,}', ' ', sentence)
    return sentence
261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "metadata": { 266 | "colab": { 267 | "base_uri": "https://localhost:8080/" 268 | }, 269 | "id": "yilIVwVuKZ0B", 270 | "outputId": "4fe1c4d4-351c-403a-925f-5407894584bb" 271 | }, 272 | "source": [ 273 | "!wget http://www.gutenberg.org/cache/epub/28/pg28.txt #Aesop's Fables, by Aesop\n", 274 | "!wget https://www.gutenberg.org/files/1727/1727-0.txt #The Odyssey, by Homer\n", 275 | "!wget http://www.gutenberg.org/files/6130/6130-0.txt #The Iliad, by Homer" 276 | ], 277 | "execution_count": null, 278 | "outputs": [ 279 | { 280 | "output_type": "stream", 281 | "text": [ 282 | "--2020-12-05 05:24:55-- http://www.gutenberg.org/cache/epub/28/pg28.txt\n", 283 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 284 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.\n", 285 | "HTTP request sent, awaiting response... 200 OK\n", 286 | "Length: 86418 (84K) [text/plain]\n", 287 | "Saving to: ‘pg28.txt.1’\n", 288 | "\n", 289 | "pg28.txt.1 100%[===================>] 84.39K 310KB/s in 0.3s \n", 290 | "\n", 291 | "2020-12-05 05:24:55 (310 KB/s) - ‘pg28.txt.1’ saved [86418/86418]\n", 292 | "\n", 293 | "--2020-12-05 05:24:55-- https://www.gutenberg.org/files/1727/1727-0.txt\n", 294 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 295 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.\n", 296 | "HTTP request sent, awaiting response... 
200 OK\n", 297 | "Length: 718181 (701K) [text/plain]\n", 298 | "Saving to: ‘1727-0.txt.1’\n", 299 | "\n", 300 | "1727-0.txt.1 100%[===================>] 701.35K 672KB/s in 1.0s \n", 301 | "\n", 302 | "2020-12-05 05:24:57 (672 KB/s) - ‘1727-0.txt.1’ saved [718181/718181]\n", 303 | "\n", 304 | "--2020-12-05 05:24:57-- http://www.gutenberg.org/files/6130/6130-0.txt\n", 305 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 306 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.\n", 307 | "HTTP request sent, awaiting response... 200 OK\n", 308 | "Length: 1161898 (1.1M) [text/plain]\n", 309 | "Saving to: ‘6130-0.txt.1’\n", 310 | "\n", 311 | "6130-0.txt.1 100%[===================>] 1.11M 968KB/s in 1.2s \n", 312 | "\n", 313 | "2020-12-05 05:24:58 (968 KB/s) - ‘6130-0.txt.1’ saved [1161898/1161898]\n", 314 | "\n" 315 | ], 316 | "name": "stdout" 317 | } 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "metadata": { 323 | "colab": { 324 | "base_uri": "https://localhost:8080/" 325 | }, 326 | "id": "KN0yEiMHVekw", 327 | "outputId": "6177a0a1-c6ef-4669-99ba-317764352779" 328 | }, 329 | "source": [ 330 | "#Assign class TransitionMatrix to markov_matrix\n", 331 | "Greeks_matrix=TransitionMatrix()\n", 332 | "Greeks=['pg28.txt','1727-0.txt','6130-0.txt']\n", 333 | "for FileName in Greeks:\n", 334 | " print(f'Processing {FileName}')\n", 335 | " # Open file\n", 336 | " with open(FileName) as f:\n", 337 | " content = f.readlines()\n", 338 | " #remove whitespace characters like `\\n` at the end of each line\n", 339 | " content = [x.strip() for x in content]\n", 340 | " content = [x.strip() for x in content if x!=\"\"]\n", 341 | " # Process file\n", 342 | " for text in content:\n", 343 | " doc=preprocess_sentence(text)\n", 344 | " doc=doc.split()\n", 345 | " l=len(doc)\n", 346 | " for i in range(2,l):\n", 347 | " Greeks_matrix.add_tripple(doc[i-2],doc[i-1],doc[i])" 348 | ], 349 | 
"execution_count": null, 350 | "outputs": [ 351 | { 352 | "output_type": "stream", 353 | "text": [ 354 | "Processing pg28.txt\n", 355 | "Processing 1727-0.txt\n", 356 | "Processing 6130-0.txt\n" 357 | ], 358 | "name": "stdout" 359 | } 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "id": "oqaJnnikSCfM" 366 | }, 367 | "source": [ 368 | "Finally, generate text" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "metadata": { 374 | "colab": { 375 | "base_uri": "https://localhost:8080/" 376 | }, 377 | "id": "HQRUCzzfO18N", 378 | "outputId": "a818874c-03aa-4869-aa8b-9cb03ba6eb30" 379 | }, 380 | "source": [ 381 | "word1=\"it\"\n", 382 | "word2=\"is\"\n", 383 | "story=word1+\" \"+word2\n", 384 | "for i in range(50):\n", 385 | " new_word=Greeks_matrix.next_word(word1,word2)\n", 386 | " story=story+\" \"+new_word\n", 387 | " if new_word==\".\":\n", 388 | " print(story)\n", 389 | " story=\"\"\n", 390 | " word1=word2\n", 391 | " word2=new_word\n", 392 | "print(story)" 393 | ], 394 | "execution_count": null, 395 | "outputs": [ 396 | { 397 | "output_type": "stream", 398 | "text": [ 399 | "it is enchanted and you can easily find another seat near telemachus he said to her ships and shelters there .\n", 400 | " the writer as stretching all and you do or cause to fight for the soil .\n", 401 | " he fenced the raft .\n", 402 | " at last they shall have plenty of it but by\n" 403 | ], 404 | "name": "stdout" 405 | } 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "metadata": { 411 | "colab": { 412 | "base_uri": "https://localhost:8080/" 413 | }, 414 | "id": "ENX3sUhBSn3P", 415 | "outputId": "3470fdeb-ec1e-49fa-c4ee-a81eb4f6c20e" 416 | }, 417 | "source": [ 418 | "!wget http://www.gutenberg.org/files/2600/2600-0.txt #War and Peace, by Leo Tolstoy\n", 419 | "!wget http://www.gutenberg.org/files/1399/1399-0.txt #Anna Karenina, by Leo Tolstoy\n", 420 | "!wget http://www.gutenberg.org/cache/epub/4761/pg4761.txt #The Cossacks, by Leo 
Tolstoy" 421 | ], 422 | "execution_count": null, 423 | "outputs": [ 424 | { 425 | "output_type": "stream", 426 | "text": [ 427 | "--2020-12-05 05:25:20-- http://www.gutenberg.org/files/2600/2600-0.txt\n", 428 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 429 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.\n", 430 | "HTTP request sent, awaiting response... 200 OK\n", 431 | "Length: 3359584 (3.2M) [text/plain]\n", 432 | "Saving to: ‘2600-0.txt.2’\n", 433 | "\n", 434 | "2600-0.txt.2 100%[===================>] 3.20M 1.99MB/s in 1.6s \n", 435 | "\n", 436 | "2020-12-05 05:25:22 (1.99 MB/s) - ‘2600-0.txt.2’ saved [3359584/3359584]\n", 437 | "\n", 438 | "--2020-12-05 05:25:22-- http://www.gutenberg.org/files/1399/1399-0.txt\n", 439 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 440 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.\n", 441 | "HTTP request sent, awaiting response... 200 OK\n", 442 | "Length: 2068079 (2.0M) [text/plain]\n", 443 | "Saving to: ‘1399-0.txt.2’\n", 444 | "\n", 445 | "1399-0.txt.2 100%[===================>] 1.97M 1.30MB/s in 1.5s \n", 446 | "\n", 447 | "2020-12-05 05:25:24 (1.30 MB/s) - ‘1399-0.txt.2’ saved [2068079/2068079]\n", 448 | "\n", 449 | "--2020-12-05 05:25:24-- http://www.gutenberg.org/cache/epub/4761/pg4761.txt\n", 450 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 451 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.\n", 452 | "HTTP request sent, awaiting response... 
200 OK\n", 453 | "Length: 370896 (362K) [text/plain]\n", 454 | "Saving to: ‘pg4761.txt.2’\n", 455 | "\n", 456 | "pg4761.txt.2 100%[===================>] 362.20K 492KB/s in 0.7s \n", 457 | "\n", 458 | "2020-12-05 05:25:25 (492 KB/s) - ‘pg4761.txt.2’ saved [370896/370896]\n", 459 | "\n" 460 | ], 461 | "name": "stdout" 462 | } 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "metadata": { 468 | "colab": { 469 | "base_uri": "https://localhost:8080/" 470 | }, 471 | "id": "59r5gk7pUsSw", 472 | "outputId": "86257cbd-c405-46d5-b3e6-0f29598102ce" 473 | }, 474 | "source": [ 475 | "#Assign class TransitionMatrix to markov_matrix\n", 476 | "Tolstoy_matrix=TransitionMatrix()\n", 477 | "Tolstoy=['2600-0.txt','1399-0.txt','pg4761.txt']\n", 478 | "for FileName in Tolstoy:\n", 479 | " print(f'Processing {FileName}')\n", 480 | " # Open file\n", 481 | " with open(FileName) as f:\n", 482 | " content = f.readlines()\n", 483 | " #remove whitespace characters like `\\n` at the end of each line\n", 484 | " content = [x.strip() for x in content]\n", 485 | " content = [x.strip() for x in content if x!=\"\"]\n", 486 | " # Process file\n", 487 | " for text in content:\n", 488 | " doc=preprocess_sentence(text)\n", 489 | " doc=doc.split()\n", 490 | " l=len(doc)\n", 491 | " for i in range(2,l):\n", 492 | " Tolstoy_matrix.add_tripple(doc[i-2],doc[i-1],doc[i])" 493 | ], 494 | "execution_count": null, 495 | "outputs": [ 496 | { 497 | "output_type": "stream", 498 | "text": [ 499 | "Processing 2600-0.txt\n", 500 | "Processing 1399-0.txt\n", 501 | "Processing pg4761.txt\n" 502 | ], 503 | "name": "stdout" 504 | } 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "metadata": { 510 | "colab": { 511 | "base_uri": "https://localhost:8080/" 512 | }, 513 | "id": "nlt60GI9Wjcj", 514 | "outputId": "2c785278-5a02-44d0-d467-55ea6ba77e20" 515 | }, 516 | "source": [ 517 | "word1=\"it\"\n", 518 | "word2=\"is\"\n", 519 | "story=word1+\" \"+word2\n", 520 | "for i in range(50):\n", 521 | " 
new_word=Tolstoy_matrix.next_word(word1,word2)\n", 522 | " story=story+\" \"+new_word\n", 523 | " if new_word==\".\":\n", 524 | " print(story)\n", 525 | " story=\"\"\n", 526 | " word1=word2\n", 527 | " word2=new_word\n", 528 | "print(story)" 529 | ], 530 | "execution_count": null, 531 | "outputs": [ 532 | { 533 | "output_type": "stream", 534 | "text": [ 535 | "it is really ended ? i am an exception .\n", 536 | " .\n", 537 | " but being with nature seeing her patient smiling face and rigid .\n", 538 | " only when they came to a longstanding impression related to the war of 1815 alexander possesses all .\n", 539 | " he could not make out something black .\n", 540 | " pierre received one\n" 541 | ], 542 | "name": "stdout" 543 | } 544 | ] 545 | } 546 | ] 547 | } -------------------------------------------------------------------------------- /online_retail/Online_retail_Combine_Segmentations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Online retail - Combine Segmentations.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "toc_visible": true, 10 | "authorship_tag": "ABX9TyM1c326tjXQeQKeoNdo9kXk", 11 | "include_colab_link": true 12 | }, 13 | "kernelspec": { 14 | "name": "python3", 15 | "display_name": "Python 3" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "vMgo0NvkNM1C" 33 | }, 34 | "source": [ 35 | "# Customer Segmentation of online retail customers\r\n", 36 | "## Part III - Combine Segmentations" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "JJosaWskQR6X" 43 | }, 44 | "source": [ 45 | "## Introduction\r\n", 46 | "In this notebook we combine the two customer
segmentations of the online retail customers dataset ([Online Retail II Data Set](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II) from UC Irvine Machine Learning Repository)\r\n", 47 | "\r\n", 48 | "Specifically, we combine\r\n", 49 | "* Customer [segmentation by category](https://github.com/dpanagop/data_analytics_examples/blob/master/online_retail/Online_retail_Segmentation_by_buying_category.ipynb) of bought items description and\r\n", 50 | "* Customer [segmentation by RFM and country](https://github.com/dpanagop/data_analytics_examples/blob/master/online_retail/Online_retail_Segmentation_by_RFM_Country.ipynb)." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "JqY04x3ePHbb" 57 | }, 58 | "source": [ 59 | "## Loading libraries and results of segmentations\r\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "SQd4r1DQ23sa" 66 | }, 67 | "source": [ 68 | "import pandas as pd\r\n", 69 | "import numpy as np\r\n", 70 | "import matplotlib.pyplot as plt\r\n", 71 | "import seaborn as sns\r\n", 72 | "sns.set_style(\"whitegrid\")" 73 | ], 74 | "execution_count": 1, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "colab": { 81 | "base_uri": "https://localhost:8080/" 82 | }, 83 | "id": "19vwadc-9Aw1", 84 | "outputId": "82657cff-6208-4f9a-bd84-e1dad2d54599" 85 | }, 86 | "source": [ 87 | "!wget github.com/dpanagop/data_analytics_examples/raw/master/online_retail/customer_segments_RFM_country.pickle\r\n", 88 | "!wget github.com/dpanagop/data_analytics_examples/raw/master/online_retail/customer_segments_buying_categories.pickle" 89 | ], 90 | "execution_count": 2, 91 | "outputs": [ 92 | { 93 | "output_type": "stream", 94 | "text": [ 95 | "--2021-01-14 03:36:11-- http://github.com/dpanagop/data_analytics_examples/raw/master/online_retail/customer_segments_RFM_country.pickle\n", 96 | "Resolving github.com (github.com)... 
140.82.114.3\n", 97 | "Connecting to github.com (github.com)|140.82.114.3|:80... connected.\n", 98 | "HTTP request sent, awaiting response... 301 Moved Permanently\n", 99 | "Location: https://github.com/dpanagop/data_analytics_examples/raw/master/online_retail/customer_segments_RFM_country.pickle [following]\n", 100 | "--2021-01-14 03:36:11-- https://github.com/dpanagop/data_analytics_examples/raw/master/online_retail/customer_segments_RFM_country.pickle\n", 101 | "Connecting to github.com (github.com)|140.82.114.3|:443... connected.\n", 102 | "HTTP request sent, awaiting response... 302 Found\n", 103 | "Location: https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/customer_segments_RFM_country.pickle [following]\n", 104 | "--2021-01-14 03:36:11-- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/customer_segments_RFM_country.pickle\n", 105 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", 106 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", 107 | "HTTP request sent, awaiting response... 200 OK\n", 108 | "Length: 262528 (256K) [application/octet-stream]\n", 109 | "Saving to: ‘customer_segments_RFM_country.pickle’\n", 110 | "\n", 111 | "customer_segments_R 100%[===================>] 256.38K --.-KB/s in 0.03s \n", 112 | "\n", 113 | "2021-01-14 03:36:11 (7.96 MB/s) - ‘customer_segments_RFM_country.pickle’ saved [262528/262528]\n", 114 | "\n", 115 | "URL transformed to HTTPS due to an HSTS policy\n", 116 | "--2021-01-14 03:36:12-- https://github.com/dpanagop/data_analytics_examples/raw/master/online_retail/customer_segments_buying_categories.pickle\n", 117 | "Resolving github.com (github.com)... 140.82.112.4\n", 118 | "Connecting to github.com (github.com)|140.82.112.4|:443... connected.\n", 119 | "HTTP request sent, awaiting response... 
302 Found\n", 120 | "Location: https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/customer_segments_buying_categories.pickle [following]\n", 121 | "--2021-01-14 03:36:12-- https://raw.githubusercontent.com/dpanagop/data_analytics_examples/master/online_retail/customer_segments_buying_categories.pickle\n", 122 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", 123 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", 124 | "HTTP request sent, awaiting response... 200 OK\n", 125 | "Length: 262338 (256K) [application/octet-stream]\n", 126 | "Saving to: ‘customer_segments_buying_categories.pickle’\n", 127 | "\n", 128 | "customer_segments_b 100%[===================>] 256.19K --.-KB/s in 0.03s \n", 129 | "\n", 130 | "2021-01-14 03:36:12 (8.23 MB/s) - ‘customer_segments_buying_categories.pickle’ saved [262338/262338]\n", 131 | "\n" 132 | ], 133 | "name": "stdout" 134 | } 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "cwZSxK9P3PM0", 141 | "colab": { 142 | "base_uri": "https://localhost:8080/", 143 | "height": 206 144 | }, 145 | "outputId": "f2f2822d-6489-455e-ff23-4db823c5d0c7" 146 | }, 147 | "source": [ 148 | "customer_spending_per_category=pd.read_pickle('customer_segments_buying_categories.pickle')\r\n", 149 | "customer_spending_per_category.head()" 150 | ], 151 | "execution_count": 3, 152 | "outputs": [ 153 | { 154 | "output_type": "execute_result", 155 | "data": { 156 | "text/html": [ 157 | "
\n", 158 | "\n", 171 | "\n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | "
categoryCustomer ID0123cluster
012346.00.8417160.0000000.1582840.0000001
112347.00.8539720.1012900.0041680.0405691
212348.00.9102700.0000000.0466480.0430821
312349.00.9249070.0084460.0023160.0643311
412350.00.9389950.0610050.0000000.0000001
\n", 231 | "
" 232 | ], 233 | "text/plain": [ 234 | "category Customer ID 0 1 2 3 cluster\n", 235 | "0 12346.0 0.841716 0.000000 0.158284 0.000000 1\n", 236 | "1 12347.0 0.853972 0.101290 0.004168 0.040569 1\n", 237 | "2 12348.0 0.910270 0.000000 0.046648 0.043082 1\n", 238 | "3 12349.0 0.924907 0.008446 0.002316 0.064331 1\n", 239 | "4 12350.0 0.938995 0.061005 0.000000 0.000000 1" 240 | ] 241 | }, 242 | "metadata": { 243 | "tags": [] 244 | }, 245 | "execution_count": 3 246 | } 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "colab": { 253 | "base_uri": "https://localhost:8080/", 254 | "height": 238 255 | }, 256 | "id": "Z5MVReoK_Cff", 257 | "outputId": "3d50340d-cbf6-4fe1-f8af-3a4c49fbfb4a" 258 | }, 259 | "source": [ 260 | "customer_RFM_GDP=pd.read_pickle('customer_segments_RFM_country.pickle')\r\n", 261 | "customer_RFM_GDP.head()" 262 | ], 263 | "execution_count": 4, 264 | "outputs": [ 265 | { 266 | "output_type": "execute_result", 267 | "data": { 268 | "text/html": [ 269 | "
\n", 270 | "\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | "
recencyfrequencymonetary_valueweighted GDPcluster
Customer ID
12346.032542.647059-3.8047062638296.00
12347.0150.375000704.16500020805.05
12348.07487.400000403.880000267856.05
12349.018147.000000880.9080001848222.02
12350.0309309.000000334.400000366386.05
\n", 345 | "
" 346 | ], 347 | "text/plain": [ 348 | " recency frequency monetary_value weighted GDP cluster\n", 349 | "Customer ID \n", 350 | "12346.0 325 42.647059 -3.804706 2638296.0 0\n", 351 | "12347.0 1 50.375000 704.165000 20805.0 5\n", 352 | "12348.0 74 87.400000 403.880000 267856.0 5\n", 353 | "12349.0 18 147.000000 880.908000 1848222.0 2\n", 354 | "12350.0 309 309.000000 334.400000 366386.0 5" 355 | ] 356 | }, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "execution_count": 4 361 | } 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "_6GYpW2j_UXE" 368 | }, 369 | "source": [ 370 | "As said in [Part I](https://github.com/dpanagop/data_analytics_examples/blob/master/online_retail/Online_retail_Segmentation_by_buying_category.ipynb), clustering based on categories spending resulted infour clusters with all of them having a high percentage of spending in category 0. In addition:\r\n", 371 | "* cluster 0 has customers with high spending in category 2,\r\n", 372 | "* cluster 1 has high spending only in category 0,\r\n", 373 | "* cluster 2 has customers with high spending in category 3,\r\n", 374 | "* cluster 3 has customers with high spending in category 1.\r\n", 375 | "\r\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "metadata": { 381 | "colab": { 382 | "base_uri": "https://localhost:8080/", 383 | "height": 426 384 | }, 385 | "id": "kG8PDBIa7KO_", 386 | "outputId": "c3c359a7-adcc-458c-d3ce-fd8e06c80977" 387 | }, 388 | "source": [ 389 | "def cluster_profile(customer_spending_per_category):\r\n", 390 | " ''' profile clusters '''\r\n", 391 | " customer_clusters=customer_spending_per_category.groupby(['cluster']).agg({'Customer ID':['count'], \r\n", 392 | " 0:'median',\r\n", 393 | " 1:'median',\r\n", 394 | " 2:'median',\r\n", 395 | " 3:'median'})\r\n", 396 | " print(customer_clusters)\r\n", 397 | " print('\\n')\r\n", 398 | " customer_clusters=customer_clusters.drop([('Customer ID', 'count')],1)\r\n", 399 | " 
customer_clusters.columns=[0,1,2,3]\r\n", 400 | " \r\n", 401 | " customer_clusters[0]=100*customer_clusters[0]\r\n", 402 | " customer_clusters[1]=100*customer_clusters[1]\r\n", 403 | " customer_clusters[2]=100*customer_clusters[2]\r\n", 404 | " customer_clusters[3]=100*customer_clusters[3]\r\n", 405 | " \r\n", 406 | "\r\n", 407 | " sns.heatmap(customer_clusters, annot=True, linewidths=.5)\r\n", 408 | "\r\n", 409 | "cluster_profile(customer_spending_per_category)" 410 | ], 411 | "execution_count": 5, 412 | "outputs": [ 413 | { 414 | "output_type": "stream", 415 | "text": [ 416 | " Customer ID 0 1 2 3\n", 417 | " count median median median median\n", 418 | "cluster \n", 419 | "0 421 0.670152 0.007819 0.228504 0.043079\n", 420 | "1 4287 0.892189 0.015741 0.001038 0.031568\n", 421 | "2 807 0.672202 0.005246 0.017402 0.246114\n", 422 | "3 427 0.534162 0.362365 0.000000 0.009505\n", 423 | "\n", 424 | "\n" 425 | ], 426 | "name": "stdout" 427 | }, 428 | { 429 | "output_type": "display_data", 430 | "data": { 431 | "image/png": "\n", 432 | "text/plain": [ 433 | "
" 434 | ] 435 | }, 436 | "metadata": { 437 | "tags": [], 438 | "needs_background": "light" 439 | } 440 | } 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "metadata": { 446 | "colab": { 447 | "base_uri": "https://localhost:8080/" 448 | }, 449 | "id": "iOGu1o2b_O9H", 450 | "outputId": "4a30b057-ea03-4212-9ced-d1dbdcce25e4" 451 | }, 452 | "source": [ 453 | "customer_spending_per_category['cluster'].value_counts()" 454 | ], 455 | "execution_count": 6, 456 | "outputs": [ 457 | { 458 | "output_type": "execute_result", 459 | "data": { 460 | "text/plain": [ 461 | "1 4287\n", 462 | "2 807\n", 463 | "3 427\n", 464 | "0 421\n", 465 | "Name: cluster, dtype: int64" 466 | ] 467 | }, 468 | "metadata": { 469 | "tags": [] 470 | }, 471 | "execution_count": 6 472 | } 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": { 478 | "id": "O3A-9wA3ALY4" 479 | }, 480 | "source": [ 481 | "As for RFM-Country clustering from [Part II](https://github.com/dpanagop/data_analytics_examples/blob/master/online_retail/Online_retail_Segmentation_by_RFM_Country.ipynb), we have four major clusters that can be ranked by RFM score (from best to to worst) as:\r\n", 482 | "\r\n", 483 | "cluster 4 > cluster 0 > cluster 5 > cluster 2\r\n", 484 | "\r\n", 485 | "and that cluster 5 has customers from countries with lower GDP than the rest.\r\n", 486 | "Thus, we will keep these four clusters plus one more into which we will merge the rest." 
487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "metadata": { 492 | "colab": { 493 | "base_uri": "https://localhost:8080/", 494 | "height": 748 495 | }, 496 | "id": "njWvI3bc9gd_", 497 | "outputId": "3bbc8040-7069-461f-89c1-8e26868e77e5" 498 | }, 499 | "source": [ 500 | "def cluster_profile_RFM_country(customer_clustering,cut_off=0):\r\n", 501 | " ''' profile clusters with size equal or bigger than cut_off'''\r\n", 502 | "\r\n", 503 | " customer_clusters=customer_clustering.reset_index().groupby(['cluster']).agg({'Customer ID':['count'], \r\n", 504 | " 'recency':'median',\r\n", 505 | " 'frequency':'median',\r\n", 506 | " 'monetary_value':'median',\r\n", 507 | " 'weighted GDP':'median'})\r\n", 508 | " idx= customer_clusters['Customer ID']>=cut_off\r\n", 509 | " idx=idx['count'].to_list()\r\n", 510 | " customer_clusters=customer_clusters[idx]\r\n", 511 | " print(customer_clusters)\r\n", 512 | " print('\\n')\r\n", 513 | " customer_clusters_sum=customer_clusters.sum(axis=0)\r\n", 514 | " #print(customer_clusters_sum)\r\n", 515 | " customer_clusters['monetary_value']=100*customer_clusters['monetary_value']/customer_clusters_sum['monetary_value']\r\n", 516 | " customer_clusters['frequency']=100*customer_clusters['frequency']/customer_clusters_sum['frequency']\r\n", 517 | " customer_clusters['recency']=100*customer_clusters['recency']/customer_clusters_sum['recency']\r\n", 518 | " customer_clusters['weighted GDP']=100*customer_clusters['weighted GDP']/customer_clusters_sum['weighted GDP']\r\n", 519 | " print(customer_clusters)\r\n", 520 | " print('\\n')\r\n", 521 | " \r\n", 522 | " print('HEAT map')\r\n", 523 | " print('Numbers are column percentages')\r\n", 524 | " sns.heatmap(customer_clusters.drop(['Customer ID'],axis=1), annot=True, linewidths=.5)\r\n", 525 | "cluster_profile_RFM_country(customer_RFM_GDP,cut_off=100)" 526 | ], 527 | "execution_count": 8, 528 | "outputs": [ 529 | { 530 | "output_type": "stream", 531 | "text": [ 532 | " Customer ID recency 
frequency monetary_value weighted GDP\n", 533 | " count median median median median\n", 534 | "cluster \n", 535 | "0 1547 389 227.333333 204.902500 2638296.0\n", 536 | "2 3549 35 58.666667 243.840000 2638296.0\n", 537 | "4 572 619 603.500000 166.050000 2638296.0\n", 538 | "5 247 79 92.125000 393.990625 503416.0\n", 539 | "\n", 540 | "\n", 541 | " Customer ID recency frequency monetary_value weighted GDP\n", 542 | " count median median median median\n", 543 | "cluster \n", 544 | "0 1547 34.670232 23.158878 20.311848 31.339994\n", 545 | "2 3549 3.119430 5.976485 24.171697 31.339994\n", 546 | "4 572 55.169340 61.479689 16.460426 31.339994\n", 547 | "5 247 7.040998 9.384948 39.056029 5.980017\n", 548 | "\n", 549 | "\n", 550 | "HEAT map\n", 551 | "Numbers are column percentages\n" 552 | ], 553 | "name": "stdout" 554 | }, 555 | { 556 | "output_type": "display_data", 557 | "data": { 558 | "image/png": "\n", 559 | "text/plain": [ 560 | "
" 561 | ] 562 | }, 563 | "metadata": { 564 | "tags": [], 565 | "needs_background": "light" 566 | } 567 | } 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "metadata": { 573 | "colab": { 574 | "base_uri": "https://localhost:8080/" 575 | }, 576 | "id": "6bL3wYQ2Blg3", 577 | "outputId": "023d404f-951a-4002-e302-730d166ed7b5" 578 | }, 579 | "source": [ 580 | "customer_RFM_GDP['cluster'].value_counts()" 581 | ], 582 | "execution_count": 7, 583 | "outputs": [ 584 | { 585 | "output_type": "execute_result", 586 | "data": { 587 | "text/plain": [ 588 | "2 3549\n", 589 | "0 1547\n", 590 | "4 572\n", 591 | "5 247\n", 592 | "6 17\n", 593 | "1 9\n", 594 | "3 1\n", 595 | "Name: cluster, dtype: int64" 596 | ] 597 | }, 598 | "metadata": { 599 | "tags": [] 600 | }, 601 | "execution_count": 7 602 | } 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "metadata": { 608 | "colab": { 609 | "base_uri": "https://localhost:8080/" 610 | }, 611 | "id": "JpQX3zOgCwhA", 612 | "outputId": "e8474378-147e-4a1a-936b-3412eff1b955" 613 | }, 614 | "source": [ 615 | "customer_RFM_GDP['RFM_country_cluster']=5\r\n", 616 | "customer_RFM_GDP.loc[customer_RFM_GDP['cluster']==4,'RFM_country_cluster']=1\r\n", 617 | "customer_RFM_GDP.loc[customer_RFM_GDP['cluster']==0,'RFM_country_cluster']=2\r\n", 618 | "customer_RFM_GDP.loc[customer_RFM_GDP['cluster']==5,'RFM_country_cluster']=3\r\n", 619 | "customer_RFM_GDP.loc[customer_RFM_GDP['cluster']==2,'RFM_country_cluster']=4\r\n", 620 | "customer_RFM_GDP['RFM_country_cluster'].value_counts()" 621 | ], 622 | "execution_count": 8, 623 | "outputs": [ 624 | { 625 | "output_type": "execute_result", 626 | "data": { 627 | "text/plain": [ 628 | "4 3549\n", 629 | "2 1547\n", 630 | "1 572\n", 631 | "3 247\n", 632 | "5 27\n", 633 | "Name: RFM_country_cluster, dtype: int64" 634 | ] 635 | }, 636 | "metadata": { 637 | "tags": [] 638 | }, 639 | "execution_count": 8 640 | } 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "metadata": { 646 | "colab": { 
647 | "base_uri": "https://localhost:8080/", 648 | "height": 260 649 | }, 650 | "id": "tITyzXxfDdHi", 651 | "outputId": "e1c91bc3-17cc-4e63-a079-fd30a4e40074" 652 | }, 653 | "source": [ 654 | "customer_RFM_GDP=customer_RFM_GDP.reset_index()\r\n", 655 | "customer_RFM_GDP=customer_RFM_GDP[['Customer ID','RFM_country_cluster']]\r\n", 656 | "print(customer_RFM_GDP.shape)\r\n", 657 | "customer_spending_per_category=customer_spending_per_category[['Customer ID','cluster']]\r\n", 658 | "customer_spending_per_category.columns=['Customer ID','item_category_cluster']\r\n", 659 | "print(customer_spending_per_category.shape)\r\n", 660 | "customer_clusters=pd.merge(customer_RFM_GDP,customer_spending_per_category)\r\n", 661 | "print(customer_clusters.shape)\r\n", 662 | "customer_clusters.head()" 663 | ], 664 | "execution_count": 11, 665 | "outputs": [ 666 | { 667 | "output_type": "stream", 668 | "text": [ 669 | "(5942, 2)\n", 670 | "(5942, 2)\n", 671 | "(5942, 3)\n" 672 | ], 673 | "name": "stdout" 674 | }, 675 | { 676 | "output_type": "execute_result", 677 | "data": { 678 | "text/html": [ 679 | "
\n", 680 | "\n", 693 | "\n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | "
Customer IDRFM_country_clusteritem_category_cluster
012346.021
112347.031
212348.031
312349.041
412350.031
\n", 735 | "
" 736 | ], 737 | "text/plain": [ 738 | " Customer ID RFM_country_cluster item_category_cluster\n", 739 | "0 12346.0 2 1\n", 740 | "1 12347.0 3 1\n", 741 | "2 12348.0 3 1\n", 742 | "3 12349.0 4 1\n", 743 | "4 12350.0 3 1" 744 | ] 745 | }, 746 | "metadata": { 747 | "tags": [] 748 | }, 749 | "execution_count": 11 750 | } 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "metadata": { 756 | "colab": { 757 | "base_uri": "https://localhost:8080/", 758 | "height": 238 759 | }, 760 | "id": "B8hUIW5ODyhy", 761 | "outputId": "8a55db22-cc72-4165-c738-92da52e90ce7" 762 | }, 763 | "source": [ 764 | "pd.crosstab(customer_clusters['RFM_country_cluster'],customer_clusters['item_category_cluster'])" 765 | ], 766 | "execution_count": 12, 767 | "outputs": [ 768 | { 769 | "output_type": "execute_result", 770 | "data": { 771 | "text/html": [ 772 | "
\n", 773 | "\n", 786 | "\n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | "
item_category_cluster0123
RFM_country_cluster
1154466942
2109111223690
382071022
42892501489270
502133
\n", 841 | "
" 842 | ], 843 | "text/plain": [ 844 | "item_category_cluster 0 1 2 3\n", 845 | "RFM_country_cluster \n", 846 | "1 15 446 69 42\n", 847 | "2 109 1112 236 90\n", 848 | "3 8 207 10 22\n", 849 | "4 289 2501 489 270\n", 850 | "5 0 21 3 3" 851 | ] 852 | }, 853 | "metadata": { 854 | "tags": [] 855 | }, 856 | "execution_count": 12 857 | } 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "metadata": { 863 | "colab": { 864 | "base_uri": "https://localhost:8080/", 865 | "height": 238 866 | }, 867 | "id": "F2TppJD8F821", 868 | "outputId": "4950d4ba-62ea-4da9-e2d6-e24e4b0accfd" 869 | }, 870 | "source": [ 871 | "cross_table=pd.crosstab(customer_clusters['RFM_country_cluster'],customer_clusters['item_category_cluster'])/5942*100\r\n", 872 | "cross_table" 873 | ], 874 | "execution_count": 15, 875 | "outputs": [ 876 | { 877 | "output_type": "execute_result", 878 | "data": { 879 | "text/html": [ 880 | "
\n", 881 | "\n", 894 | "\n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | "
item_category_cluster0123
RFM_country_cluster
10.2524407.5058901.1612250.706833
21.83439918.7142383.9717271.514642
30.1346353.4836760.1682940.370246
44.86368242.0902058.2295524.543925
50.0000000.3534160.0504880.050488
\n", 949 | "
" 950 | ], 951 | "text/plain": [ 952 | "item_category_cluster 0 1 2 3\n", 953 | "RFM_country_cluster \n", 954 | "1 0.252440 7.505890 1.161225 0.706833\n", 955 | "2 1.834399 18.714238 3.971727 1.514642\n", 956 | "3 0.134635 3.483676 0.168294 0.370246\n", 957 | "4 4.863682 42.090205 8.229552 4.543925\n", 958 | "5 0.000000 0.353416 0.050488 0.050488" 959 | ] 960 | }, 961 | "metadata": { 962 | "tags": [] 963 | }, 964 | "execution_count": 15 965 | } 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "metadata": { 971 | "id": "F5ZqSD64GX6W", 972 | "colab": { 973 | "base_uri": "https://localhost:8080/", 974 | "height": 335 975 | }, 976 | "outputId": "4990de09-9a9e-4aed-fb3d-d3af3a56cb5b" 977 | }, 978 | "source": [ 979 | "fig, ax = plt.subplots(figsize=(8, 5))\r\n", 980 | "sns.heatmap(cross_table/100,\r\n", 981 | " annot=True,\r\n", 982 | " fmt='.2%',\r\n", 983 | " #cmap='rocket_r',\r\n", 984 | " #linewidths=.5,\r\n", 985 | " ax=ax)\r\n", 986 | "plt.show()" 987 | ], 988 | "execution_count": 22, 989 | "outputs": [ 990 | { 991 | "output_type": "display_data", 992 | "data": { 993 | "image/png": "\n", 994 | "text/plain": [ 995 | "
" 996 | ] 997 | }, 998 | "metadata": { 999 | "tags": [], 1000 | "needs_background": "light" 1001 | } 1002 | } 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "metadata": { 1008 | "id": "yZWpcwIRB38X" 1009 | }, 1010 | "source": [ 1011 | "" 1012 | ], 1013 | "execution_count": null, 1014 | "outputs": [] 1015 | } 1016 | ] 1017 | } --------------------------------------------------------------------------------