├── scripts
    ├── .gitignore
    ├── demo.py
    ├── ver_unver.json
    ├── timeseries_unver.json
    ├── timeseries.json
    ├── timeseries_ver.json
    ├── main.py
    ├── data_utils.py
    ├── requirements.txt
    ├── sentiment_analysis.py
    ├── ver_counts.json
    ├── data_exploration.py
    └── topic_analysis.py
├── report.pdf
├── webapp
    ├── src
    │   ├── stylus
    │   │   └── main.styl
    │   ├── assets
    │   │   └── logo.png
    │   ├── plugins
    │   │   └── vuetify.js
    │   ├── App.vue
    │   └── main.js
    ├── public
    │   ├── favicon.ico
    │   ├── B005NF5NTK.jpg
    │   ├── B0092KJ9BU.jpg
    │   ├── B00MXWFUQC.jpg
    │   ├── B00UCZGS6S.jpg
    │   ├── B00UH3L82Y.jpg
    │   ├── B00VH88CJ0.jpg
    │   ├── B00X5RV14Y.jpg
    │   ├── B018JW3EOY.jpg
    │   ├── PORTABLECHARGERS.jpg
    │   ├── index.html
    │   └── lda_B00VH88CJ0.html
    ├── vue.config.js
    ├── .gitignore
    ├── README.md
    └── package.json
├── dataframes
    ├── negative.pkl
    ├── positive.pkl
    ├── dominant_topics.pkl
    ├── most_repr_rews.pkl
    └── topic_sents_keywords.pkl
├── figures
    ├── 2_zipf_law.png
    ├── ext
    │   ├── webapp1.png
    │   ├── webapp2.png
    │   ├── webapp3.png
    │   ├── webapp4.png
    │   ├── 1_monthly.png
    │   ├── 1_peruser.png
    │   ├── webapp_plot1.png
    │   ├── webapp_plot2.png
    │   ├── webapp_plot3.png
    │   ├── 1_selfVSforced.png
    │   └── 1_frequentInfrequentYelp.png
    └── 1_opinion_distribution.svg
├── datasets
    └── pull_datasets.sh
├── report
    ├── report.tex
    ├── abstract.tex
    ├── bibliography.bib
    ├── 5_conclusioni.tex
    ├── 1_introduction.tex
    ├── 4_topic_analysis.tex
    ├── 3_sentiment_analysis.tex
    └── 2_esplorazione.tex
├── README.md
└── .gitignore


/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | lib64
3 | pyvenv.cfg
4 | share/


--------------------------------------------------------------------------------
/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/report.pdf


--------------------------------------------------------------------------------
/webapp/src/stylus/main.styl:
--------------------------------------------------------------------------------
1 | require('typeface-barlow')
2 | $body-font-family = 'Barlow'
3 | 


--------------------------------------------------------------------------------
/dataframes/negative.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/dataframes/negative.pkl


--------------------------------------------------------------------------------
/dataframes/positive.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/dataframes/positive.pkl


--------------------------------------------------------------------------------
/figures/2_zipf_law.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/2_zipf_law.png


--------------------------------------------------------------------------------
/figures/ext/webapp1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp1.png


--------------------------------------------------------------------------------
/figures/ext/webapp2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp2.png


--------------------------------------------------------------------------------
/figures/ext/webapp3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp3.png


--------------------------------------------------------------------------------
/figures/ext/webapp4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp4.png


--------------------------------------------------------------------------------
/figures/ext/1_monthly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/1_monthly.png


--------------------------------------------------------------------------------
/figures/ext/1_peruser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/1_peruser.png


--------------------------------------------------------------------------------
/webapp/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/favicon.ico


--------------------------------------------------------------------------------
/webapp/src/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/src/assets/logo.png


--------------------------------------------------------------------------------
/figures/ext/webapp_plot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp_plot1.png


--------------------------------------------------------------------------------
/figures/ext/webapp_plot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp_plot2.png


--------------------------------------------------------------------------------
/figures/ext/webapp_plot3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/webapp_plot3.png


--------------------------------------------------------------------------------
/webapp/public/B005NF5NTK.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B005NF5NTK.jpg


--------------------------------------------------------------------------------
/webapp/public/B0092KJ9BU.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B0092KJ9BU.jpg


--------------------------------------------------------------------------------
/webapp/public/B00MXWFUQC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B00MXWFUQC.jpg


--------------------------------------------------------------------------------
/webapp/public/B00UCZGS6S.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B00UCZGS6S.jpg


--------------------------------------------------------------------------------
/webapp/public/B00UH3L82Y.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B00UH3L82Y.jpg


--------------------------------------------------------------------------------
/webapp/public/B00VH88CJ0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B00VH88CJ0.jpg


--------------------------------------------------------------------------------
/webapp/public/B00X5RV14Y.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B00X5RV14Y.jpg


--------------------------------------------------------------------------------
/webapp/public/B018JW3EOY.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/B018JW3EOY.jpg


--------------------------------------------------------------------------------
/dataframes/dominant_topics.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/dataframes/dominant_topics.pkl


--------------------------------------------------------------------------------
/dataframes/most_repr_rews.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/dataframes/most_repr_rews.pkl


--------------------------------------------------------------------------------
/figures/ext/1_selfVSforced.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/1_selfVSforced.png


--------------------------------------------------------------------------------
/dataframes/topic_sents_keywords.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/dataframes/topic_sents_keywords.pkl


--------------------------------------------------------------------------------
/webapp/public/PORTABLECHARGERS.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/webapp/public/PORTABLECHARGERS.jpg


--------------------------------------------------------------------------------
/figures/ext/1_frequentInfrequentYelp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avivace/reviews-sentiment/HEAD/figures/ext/1_frequentInfrequentYelp.png


--------------------------------------------------------------------------------
/webapp/public/index.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <head>
3 | 	<link rel="icon" href="favicon.ico">
4 | </head>
5 | <title> Amazon Reviews </title>
6 | <div id="app"></div>
7 | </html>


--------------------------------------------------------------------------------
/webapp/src/plugins/vuetify.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue';
2 | import Vuetify from 'vuetify/lib';
3 | 
4 | Vue.use(Vuetify);
5 | 
6 | export default new Vuetify({
7 | });
8 | 


--------------------------------------------------------------------------------
/webapp/vue.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   "publicPath": '/reviews-sentiment/',
3 |   "transpileDependencies": [
4 |     "vuetify"
5 |   ],
6 |     lintOnSave: false
7 | }


--------------------------------------------------------------------------------
/webapp/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | node_modules
 3 | /dist
 4 | 
 5 | # local env files
 6 | .env.local
 7 | .env.*.local
 8 | 
 9 | # Log files
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | 
14 | # Editor directories and files
15 | .idea
16 | .vscode
17 | *.suo
18 | *.ntvs*
19 | *.njsproj
20 | *.sln
21 | *.sw?
22 | 


--------------------------------------------------------------------------------
/webapp/src/App.vue:
--------------------------------------------------------------------------------
 1 | <template>
 2 | 
 3 | 
 4 | 
 5 |       <Demo/>
 6 |     
 7 | 
 8 | </template>
 9 | 
10 | <script>
11 | import Demo from './components/Demo';
12 | 
13 | export default {
14 |   name: 'App',
15 | 
16 |   components: {
17 |     Demo,
18 |   },
19 | 
20 |   data: () => ({
21 |     //
22 |   }),
23 | };
24 | </script>
25 | 
26 | <style>
27 | .navtitle{
28 |   font-size:24px;
29 | }
30 | 
31 | </style>


--------------------------------------------------------------------------------
/webapp/README.md:
--------------------------------------------------------------------------------
 1 | # webapp
 2 | 
 3 | ## Project setup
 4 | ```
 5 | npm install
 6 | ```
 7 | 
 8 | ### Compiles and hot-reloads for development on localhost:8080/reviews-sentiment
 9 | ```
10 | npm run serve
11 | ```
12 | 
13 | ### Compiles and minifies for production
14 | ```
15 | npm run build
16 | ```
17 | 
18 | ### Lints and fixes files
19 | ```
20 | npm run lint
21 | ```
22 | 
23 | ### Deploy
24 | 
25 | scp -r dist root@IP:/var/www/html
26 | 
27 | ### Customize configuration
28 | See [Configuration Reference](https://cli.vuejs.org/config/).
29 | 


--------------------------------------------------------------------------------
/datasets/pull_datasets.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This script pulls the datasets and extracts them
 3 | # Data source: http://jmcauley.ucsd.edu/data/amazon/
 4 | 
 5 | #wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Grocery_and_Gourmet_Food.json.gz
 6 | #wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_Grocery_and_Gourmet_Food.json.gz
 7 | wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz
 8 | 
 9 | #gunzip Grocery_and_Gourmet_Food.json.gz
10 | #gunzip meta_Grocery_and_Gourmet_Food.json.gz
11 | gunzip Grocery_and_Gourmet_Food_5.json.gz
12 | 


--------------------------------------------------------------------------------
/webapp/src/main.js:
--------------------------------------------------------------------------------
 1 | import Vue from 'vue'
 2 | import App from './App.vue'
 3 | import vuetify from './plugins/vuetify';
 4 | import axios from 'axios'
 5 | import VueApexCharts from 'vue-apexcharts'
 6 | 
 7 | import '@mdi/font/css/materialdesignicons.css'
 8 | import './stylus/main.styl'
 9 | import 'typeface-barlow'
10 | import 'roboto-fontface/css/roboto/roboto-fontface.css'
11 | 
12 | Vue.config.productionTip = false
13 | Vue.prototype.$axios = axios
14 | Vue.use(VueApexCharts)
15 | 
16 | Vue.component('apexchart', VueApexCharts)
17 | 
18 | new Vue({
19 |   vuetify,
20 |   render: h => h(App)
21 | }).$mount('#app')
22 | 


--------------------------------------------------------------------------------
/report/report.tex:
--------------------------------------------------------------------------------
 1 | \documentclass{report}
 2 | \usepackage{amsmath}
 3 | \usepackage{ dsfont }
 4 | \usepackage{ svg }
 5 | 
 6 | \usepackage[colorlinks,citecolor=black,linkcolor=black,urlcolor=blue,bookmarks=false,hypertexnames=true]{hyperref}
 7 | \usepackage[T1]{fontenc}
 8 | \usepackage[utf8]{inputenc}
 9 | \usepackage{graphicx}
10 | \usepackage{subfigure}
11 | \usepackage{caption}
12 | \usepackage{subcaption}
13 | \usepackage{titling}
14 | \usepackage{float}
15 | \usepackage{textcomp}
16 | \setlength\parindent{0pt}
17 | \usepackage[
18 | backend=biber,
19 | style=numeric,
20 | sorting=ynt
21 | ]{biblatex}
22 | 
23 | \addbibresource{bibliography.bib}
24 | 
25 | \begin{document}
26 | \title{%
27 |   \Huge Amazon Reviews \\
28 |   \large Sentiment and Aspect Based Analysis\\
29 |     }
30 | \author{
31 |   Coppola Matteo\\
32 |   \texttt{793329}
33 |   \and
34 |   Palazzi Luca\\
35 |   \texttt{793556}
36 |    \and
37 |   Vivace Antonio\\
38 |   \texttt{793509}
39 | }
40 | \date{Data Analytics, January 2020}
41 | \maketitle
42 | 
43 | \input{abstract.tex}
44 | \tableofcontents
45 | \listoffigures
46 | \listoftables
47 | \pagebreak
48 | %%
49 | \input{1_introduction.tex}
50 | \input{2_esplorazione.tex}
51 | \input{3_sentiment_analysis.tex}
52 | \input{4_topic_analysis.tex}
53 | \input{5_conclusioni.tex}
54 | 
55 | \newpage
56 | \addcontentsline{toc}{chapter}{Bibliografia}
57 | \printbibliography
58 | 
59 | \end{document}


--------------------------------------------------------------------------------
/webapp/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "webapp",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "scripts": {
 6 |     "serve": "vue-cli-service serve",
 7 |     "build": "vue-cli-service build",
 8 |     "lint": "vue-cli-service lint",
 9 |     "deploy": "npm run build && npx gh-pages -d dist -b master"
10 |   },
11 |   "dependencies": {
12 |     "@mdi/font": "^3.6.95",
13 |     "apexcharts": "^3.15.5",
14 |     "axios": "^0.19.1",
15 |     "core-js": "^3.4.4",
16 |     "roboto-fontface": "*",
17 |     "vue": "^2.6.10",
18 |     "vue-apexcharts": "^1.5.2",
19 |     "vue-plotly": "^1.1.0",
20 |     "vuetify": "^2.1.0"
21 |   },
22 |   "devDependencies": {
23 |     "@vue/cli-plugin-babel": "^4.1.0",
24 |     "@vue/cli-plugin-eslint": "^4.1.0",
25 |     "@vue/cli-service": "^4.1.0",
26 |     "babel-eslint": "^10.0.3",
27 |     "css-loader": "^3.4.1",
28 |     "eslint": "^5.16.0",
29 |     "eslint-plugin-vue": "^5.0.0",
30 |     "html-webpack-plugin": "^3.2.0",
31 |     "sass": "^1.19.0",
32 |     "sass-loader": "^8.0.0",
33 |     "style-loader": "^1.1.2",
34 |     "stylus": "^0.54.7",
35 |     "stylus-loader": "^3.0.2",
36 |     "typeface-barlow": "0.0.71",
37 |     "vue-cli-plugin-vuetify": "^2.0.4",
38 |     "vue-template-compiler": "^2.6.10",
39 |     "vuetify-loader": "^1.3.0"
40 |   },
41 |   "eslintConfig": {
42 |     "root": true,
43 |     "env": {
44 |       "node": true
45 |     },
46 |     "extends": [
47 |       "plugin:vue/essential",
48 |       "eslint:recommended"
49 |     ],
50 |     "rules": {},
51 |     "parserOptions": {
52 |       "parser": "babel-eslint"
53 |     }
54 |   },
55 |   "browserslist": [
56 |     "> 1%",
57 |     "last 2 versions"
58 |   ]
59 | }
60 | 


--------------------------------------------------------------------------------
/report/abstract.tex:
--------------------------------------------------------------------------------
 1 | \begin{abstract}
 2 | 
 3 | Recentemente il mercato dello shopping online sta acquisendo sempre più rilevanza, superando i limiti della compravendita in negozi fisici e in alcuni ambiti rimpiazzandola. Con la crescita degli acquisti, cresce anche la mole di dati che venditori, produttori, pubblicitari e gestori di piattaforme di e-Commerce si trovano a dover processare per ottenere informazioni sulla natura delle transazioni, dei clienti che le producono e sui trend di mercato.
 4 | 
 5 | Una parte fondamentale di questi dati è costituita da quelli prodotti dai consumatori stessi dopo aver effettuato l'acquisto: opinioni, recensioni e valutazioni sul prodotto ed in generale sull'esperienza di acquisto.
 6 | 
 7 | La Sentiment Analysis è una disciplina che sfrutta dati di questa natura (denominati VOC: \textit{Voice of the Customer}) per estrarre, quantificare e studiare informazioni soggettive in modo sistematico.
 8 | 
 9 | Tra i campi che beneficiano di questi strumenti troviamo: sviluppo di strategie di marketing, sistemi di raccomandazione, \textit{brand monitoring}, servizio clienti e ricerche di mercato.
10 | 
11 | In questo lavoro, analizziamo un insieme di recensioni pubblicate su Amazon su articoli della categoria "Cellulari e accessori correlati" per uno studio esplorativo, estraendo dati statistici ed evoluzioni temporali sulla natura delle recensioni e delle valutazioni numeriche annesse. Procediamo poi nell'addestrare modelli di Machine Learning (Logistic Regression e Naive Bayes) per valutare la loro efficacia nell'identificare correttamente il sentimento generale delle recensioni, analizzandone poi le metriche e cercando di individuare i migliori iperparametri.
12 | Infine per i sei prodotti più recensiti applichiamo una tecnica di Topic Analysis per individuare i cluster di argomenti.
13 | 
14 | 
15 | \end{abstract}


--------------------------------------------------------------------------------
/scripts/demo.py:
--------------------------------------------------------------------------------
 1 | import dash
 2 | import dash_core_components as dcc
 3 | import dash_html_components as html
 4 | from dash.dependencies import Input, Output, State
 5 | from flask import Flask
 6 | import flask
 7 | import webbrowser
 8 | import os
 9 | 
10 | STATIC_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static')
11 | 
12 | server = Flask(__name__)
13 | app = dash.Dash(name = __name__, server = server)
14 | 
15 | urls = {
16 |     'plot_1': '/static/lda_B00MXWFUQC.html',
17 |     'plot_2': '/static/lda_B00UCZGS6S.html'
18 | }
19 | 
20 | init_key, init_val = next(iter(urls.items()))
21 | 
22 | print(init_key)
23 | print(init_val)
24 | 
25 | dd = dcc.Dropdown(
26 |     id='dropdown',
27 |     options= [{'label': k, 'value': v} for k, v in urls.items()],
28 |     #value=init_key,
29 |     placeholder="Choose the plot"
30 | )
31 | 
32 | # embedded plot element whose `src` parameter will
33 | # be populated and updated with dropdown values
34 | plot = html.Iframe(
35 |     id='plot',
36 |     style={'border': 'none', 'width': '100%', 'height': 500},
37 |     src=init_val
38 | )
39 | 
40 | # set div containing dropdown and embedded plot
41 | app.layout = html.Div(children=[dd, plot])
42 | 
43 | # Dash callback: whenever the `dropdown` component's `value` changes, copy it into the `src` property of the `plot` iframe
44 | @app.callback(
45 |     Output(component_id='plot', component_property='src'),
46 |     [Input(component_id='dropdown', component_property='value')]
47 | )
48 | def update_plot_src(input_value):  # input_value: the selected option's `value`, i.e. one of the URLs listed in `urls`
49 |     return input_value  # passed through unchanged; it becomes the new `src` declared in the Output above
50 | 
51 | '''app.layout = html.Div( 
52 |    html.Iframe(src='/static/lda_B00MXWFUQC.html', style=dict(position="absolute", left="0", top="0", width="100%", height="100%"))
53 | )'''
54 | 
55 | @app.server.route('/static/<resource>')  # route registered on the Dash app's server object for /static/<resource>
56 | def serve_static(resource):  # resource: the path segment captured after /static/
57 |     return flask.send_from_directory(STATIC_PATH, resource)  # file is looked up inside STATIC_PATH, the `static` directory next to this script
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     webbrowser.open('http://127.0.0.1:8050/', new=0, autoraise=True) 
62 |     app.run_server(debug=True, use_reloader=False)


--------------------------------------------------------------------------------
/report/bibliography.bib:
--------------------------------------------------------------------------------
 1 | @misc{usb1,
 2 |     author =       "US Census Bureau News",
 3 |     title =        "QUARTERLY RETAIL E-COMMERCE SALES",
 4 |     year =         "3rd Quarter 2019",
 5 |     howpublished = "\url{https://www.census.gov/retail/mrts/www/data/pdf/ec_current.pdf}"
 6 | }
 7 | 
 8 | @misc{amazondataset,
 9 |     author = "Amazon, Jianmo Ni",
10 |     title = "Amazon Review Data, 2018",
11 |     howpublished = "\url{https://nijianmo.github.io/amazon/index.html}"
12 | }
13 | 
14 | @inproceedings{titov2008joint,
15 |   title={A joint model of text and aspect ratings for sentiment summarization},
16 |   author={Titov, Ivan and McDonald, Ryan},
17 |   booktitle={proceedings of ACL-08: HLT},
18 |   pages={308--316},
19 |   year={2008}
20 | }
21 | 
22 | @article{blei2003latent,
23 |   title={Latent dirichlet allocation},
24 |   author={Blei, David M and Ng, Andrew Y and Jordan, Michael I},
25 |   journal={Journal of machine Learning research},
26 |   volume={3},
27 |   number={Jan},
28 |   pages={993--1022},
29 |   year={2003}
30 | }
31 | 
32 | @inproceedings{lin2009joint,
33 |   title={Joint sentiment/topic model for sentiment analysis},
34 |   author={Lin, Chenghua and He, Yulan},
35 |   booktitle={Proceedings of the 18th ACM conference on Information and knowledge management},
36 |   pages={375--384},
37 |   year={2009},
38 |   organization={ACM}
39 | }
40 | 
41 | @inproceedings{hutto2014vader,
42 |   title={Vader: A parsimonious rule-based model for sentiment analysis of social media text},
43 |   author={Hutto, Clayton J and Gilbert, Eric},
44 |   booktitle={Eighth international AAAI conference on weblogs and social media},
45 |   year={2014}
46 | }
47 | 
48 | @misc{amazon1,
49 | author = "Amazon",
50 | title="Customer Service - Verified Reviews",
51 | howpublished="\url{https://www.amazon.com/gp/help/customer/display.html/ref=hp_20079100_verifiedreviews?nodeId=201145140}"
52 | }
53 | 
54 | @misc{plots1,
55 | author="Uma Gajendragadkar",
56 | title="Product Recommender using Amazon Review dataset",
57 | howpublished="\url{https://towardsdatascience.com/product-recommender-using-amazon-review-dataset-e69d479d81dd}"}
58 | 
59 | @article{schoenmuller2018extreme,
60 |   title={The extreme distribution of online reviews: Prevalence, drivers and implications},
61 |   author={Schoenm{\"u}ller, Verena and Netzer, Oded and Stahl, Florian},
62 |   journal={Columbia Business School Research Paper},
63 |   number={18-10},
64 |   year={2018}
65 | }
66 | 
67 | @inproceedings{sievert2014ldavis,
68 |   title={LDAvis: A method for visualizing and interpreting topics},
69 |   author={Sievert, Carson and Shirley, Kenneth},
70 |   booktitle={Proceedings of the workshop on interactive language learning, visualization, and interfaces},
71 |   pages={63--70},
72 |   year={2014}
73 | }
74 | 
75 | @misc{trends,
76 | author="Edison Trends",
77 | title="eBay and Amazon seles trends",
78 | howpublished="\url{https://trends.edison.tech/research/2018-ebay-vs-amazon.html}"}


--------------------------------------------------------------------------------
/scripts/ver_unver.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     {
  3 |         "data": [
  4 |             "B005NF5NTK",
  5 |             "B0092KJ9BU",
  6 |             "B00AANQLRI",
  7 |             "B00BT8L2MW",
  8 |             "B00D856NOG",
  9 |             "B00G7UY3EG",
 10 |             "B00IGISUTG",
 11 |             "B00M51DDT2",
 12 |             "B00M6QODH2",
 13 |             "B00MQSMDYU",
 14 |             "B00MXWFUQC",
 15 |             "B00P7N0320",
 16 |             "B00QN1T6NM",
 17 |             "B00UCZGS6S",
 18 |             "B00UH3L82Y",
 19 |             "B00VH88CJ0",
 20 |             "B00X5RV14Y",
 21 |             "B014EB532U",
 22 |             "B018JW3EOY",
 23 |             "B019PV2I3G"
 24 |         ],
 25 |         "name": "products"
 26 |     },
 27 |     {
 28 |         "data": [
 29 |             4.573510773130545,
 30 |             4.160249739854319,
 31 |             4.65080971659919,
 32 |             4.601609657947686,
 33 |             4.691282051282052,
 34 |             4.324240062353858,
 35 |             4.62351868732908,
 36 |             4.63860544217687,
 37 |             4.524484536082475,
 38 |             4.653753026634383,
 39 |             3.9659863945578233,
 40 |             4.711210096510765,
 41 |             4.374885426214482,
 42 |             4.231009365244537,
 43 |             3.7160751565762005,
 44 |             4.757822277847309,
 45 |             4.792971734148205,
 46 |             4.285420944558521,
 47 |             4.715344699777613,
 48 |             4.693006993006993
 49 |         ],
 50 |         "name": "verified"
 51 |     },
 52 |     {
 53 |         "data": [
 54 |             4.196428571428571,
 55 |             3.825242718446602,
 56 |             4.285714285714286,
 57 |             4.235294117647059,
 58 |             4.621212121212121,
 59 |             4.3936170212765955,
 60 |             4.545454545454546,
 61 |             4.428571428571429,
 62 |             4.577114427860696,
 63 |             4.693069306930693,
 64 |             4.427083333333333,
 65 |             4.837349397590361,
 66 |             4.588235294117647,
 67 |             3.702127659574468,
 68 |             4.0,
 69 |             4.678571428571429,
 70 |             4.769607843137255,
 71 |             4.111111111111111,
 72 |             4.834394904458598,
 73 |             4.576923076923077
 74 |         ],
 75 |         "name": "unverified"
 76 |     },
 77 |     {
 78 |         "data": [
 79 |             4.560587515299877,
 80 |             4.12781954887218,
 81 |             4.638318670576735,
 82 |             4.589494163424124,
 83 |             4.686839577329491,
 84 |             4.328976034858388,
 85 |             4.621983914209116,
 86 |             4.623520126282557,
 87 |             4.535312180143296,
 88 |             4.656716417910448,
 89 |             4.011247443762781,
 90 |             4.725049570389953,
 91 |             4.398533007334963,
 92 |             4.2063492063492065,
 93 |             3.7296222664015906,
 94 |             4.7538644470868014,
 95 |             4.789821546596166,
 96 |             4.280719280719281,
 97 |             4.727755644090306,
 98 |             4.687002652519894
 99 |         ],
100 |         "name": "all"
101 |     }
102 | ]


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Analytics on Amazon Reviews
 2 | 
 3 | Data Analytics exam final project, [MSc in Computer Science](https://github.com/avivace/compsci).
 4 | 
 5 | By [Matteo Coppola](https://github.com/matteocoppola), [Luca Palazzi](https://github.com/lucapalazzi), [Antonio Vivace](https://github.com/avivace).
 6 | 
 7 | > Exploration, Sentiment Analysis, Topic Analysis (LDA) and a VueJS web application exposing the trained models.
 8 | 
 9 | [GO. PLAY. WITH THE PLOTS.](https://avivace.github.io/reviews-sentiment) (web demo deployment)
10 | 
11 | [Documentation](report.pdf)
12 | 
13 | 
14 | #### Exploration
15 | 
16 | <img src="figures/1_rew_len_over_time.svg" width="50%"><img src="figures/1_avg_help_25_100_traffic.svg"  width="50%">
17 | 
18 | <img src="figures/1_ver_unver_time_traffic.svg"  width="50%"><img src="figures/1_correlation_words_opinion.svg"  width="50%">
19 | 
20 | #### Web demo
21 | 
22 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp1.png">
23 | 
24 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp2.png">
25 | 
26 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp3.png">
27 | 
28 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp4.png">
29 | 
30 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp_plot2.png">
31 | 
32 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp_plot1.png">
33 | 
34 | <img src="https://github.com/avivace/reviews-sentiment/blob/develop/figures/ext/webapp_plot3.png">
35 | 
36 | 
37 | 
38 | ## Run
39 | 
40 | Set up a Python virtual environment and install the required packages
41 | 
42 | ```bash
43 | cd scripts
44 | python3 -m venv .
45 | source bin/activate
46 | pip3 install -r requirements.txt
47 | python3 -m spacy download en
48 | ```
49 | 
50 | Optionally, install an ipynb kernel to use the venv packages
51 | ```bash
52 | pip3 install --user ipykernel
53 | python -m ipykernel install --user --name=myenv
54 | # Check the installed kernels
55 | jupyter kernelspec list
56 | # Run Jupyter
57 | jupyter lab
58 | ```
59 | 
60 | 
61 | Now, to run the full pipeline:
62 | ```bash
63 | python3 main.py
64 | ```
65 | 
66 | A Flask application (in main.py) exposes a simple API on port 5000, allowing the trained models to be used on demand via plain HTTP requests. The VueJS application needs a recent version of NodeJS and npm.
67 | 
68 | ```bash
69 | cd webapp
70 | npm install
71 | # serve the web application with hot reload at localhost:8080/reviews-sentiment
72 | npm run serve
73 | # builds the web application for production
74 | npm run build
75 | # deploys the build on the master branch, making github serve it on https://avivace.github.io/reviews-sentiment
76 | npm run deploy
77 | ```
78 | 
79 | 
80 | #### Antuz notes
81 | 
82 | Accent is `#B71C1C`, typeface is *Barlow* 500. On the plots and graphs, typeface is *Inter* 600, palette is `#4DAF4A`, `#FF7F00`, `#C73E31`.
83 | 
84 | #### Final notes from our supervisor, E.Fersini
85 | 
86 | The unverified/spam "boom" happens around the first publication of a product; aggregating data over a whole category will hardly show this (there are papers on this).
87 | 


--------------------------------------------------------------------------------
/report/5_conclusioni.tex:
--------------------------------------------------------------------------------
 1 | \chapter{Web app}
 2 | Abbiamo sviluppato una applicazione web interattiva che dimostra alcuni dei nostri risultati.
 3 | 
 4 | È strutturata in due parti secondo i principi Restful: il backend utilizza Flask per poter offrire una semplice API attraverso la quale è possibile sfruttare i modelli allenati ed esporne la funzione \texttt{pred\_prob}, in modo da visualizzare in tempo reale il comportamento del classificatore su di un testo personalizzato, non facente parte del dataset iniziale. Il sistema che abbiamo costruito per esporre i modelli fittati è un buon modo di rendere utilizzabili da chiunque, senza dover passare da script, i risultati del nostro progetto. È facilmente estendibile per altri classificatori e parametri.
 5 | \par
 6 | Il frontend è sviluppato con Vue JS, un framework Javascript per sviluppare applicazioni reattive. Offre un'interfaccia che consuma l'API appena descritta, visualizzando il risultato della computazione.
 7 | \par
 8 | Per LDA, un'altra pagina nella stessa applicazione raccoglie sei plot interattivi generati da pyLDAvis, permettendo di consultarli e mostrando descrizione, codice e titolo di ognuno degli articoli a cui si riferiscono.
 9 | 
10 | \begin{figure}[H]
11 |   \centering
12 |   \captionsetup{margin=1cm}
13 |   \includegraphics[width=1\linewidth]{figures/ext/webapp1.png}
14 |   \caption{Vista Sentiment Analysis della Demo}
15 |   \label{webapp_sentiment}
16 | \end{figure}
17 | 
18 | \begin{figure}[H]
19 |   \centering
20 |   \captionsetup{margin=1cm}
21 |   \includegraphics[width=1\linewidth]{figures/ext/webapp2.png}
22 |   \caption{Vista LDA della Demo (pyLDAvis)}
23 |   \label{webapp_lda}
24 | \end{figure}
25 | 
26 | \begin{figure}[H]
27 |   \centering
28 |   \captionsetup{margin=1cm}
29 |   \includegraphics[width=1\linewidth]{figures/ext/webapp3.png}
30 |   \caption{Export pyLDAvis per un singolo prodotto}
31 |   \label{webapp_lda_product}
32 | \end{figure}
33 | 
34 | \chapter{Conclusioni}
35 | L'esplorazione delle recensioni di prodotti Amazon ci ha permesso di constatare l'enorme numero di informazioni che possono essere estratte da opinioni degli acquirenti con lo scopo di stilare statistiche e valutazioni e poter quindi prendere decisioni in ambito aziendale per migliorare i servizi offerti o centrare meglio la propria clientela.
36 | \par
37 | Lo studio di sentiment analysis dimostra che si possono ottenere modelli addestrati con metriche di controllo molto soddisfacenti e pronti per essere usati nell'analisi sentimentale delle future recensioni.
38 | \par
39 | Per quanto concerne la topic analysis, gli argomenti individuati sui prodotti più recensiti attraverso il modello LDA non sempre sono facili da interpretare. Abbiamo evitato la creazione di argomenti troppo generali ma non abbiamo sempre ottenuto argomenti facilmente utilizzabili per fare ragionamenti complessi sui prodotti. Con ogni probabilità, strumenti più avanzati di topic sentiment analysis porterebbero ad una scelta degli argomenti più logica ed intuitiva.
40 | \par
41 | I modelli utilizzati sono facilmente adattabili a qualsiasi categoria di e-Commerce dotata di una forma di recensioni e, considerati i risultati raggiunti con strumenti base di Data Analytics, non c'è da stupirsi se Amazon sia riuscito a raggiungere la vetta facendo leva su queste tecnologie.


--------------------------------------------------------------------------------
/scripts/timeseries_unver.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   {
  3 |     "x": 1359590400000,
  4 |     "y": 694
  5 |   },
  6 |   {
  7 |     "x": 1362009600000,
  8 |     "y": 548
  9 |   },
 10 |   {
 11 |     "x": 1364688000000,
 12 |     "y": 537
 13 |   },
 14 |   {
 15 |     "x": 1367280000000,
 16 |     "y": 501
 17 |   },
 18 |   {
 19 |     "x": 1369958400000,
 20 |     "y": 607
 21 |   },
 22 |   {
 23 |     "x": 1372550400000,
 24 |     "y": 587
 25 |   },
 26 |   {
 27 |     "x": 1375228800000,
 28 |     "y": 663
 29 |   },
 30 |   {
 31 |     "x": 1377907200000,
 32 |     "y": 583
 33 |   },
 34 |   {
 35 |     "x": 1380499200000,
 36 |     "y": 532
 37 |   },
 38 |   {
 39 |     "x": 1383177600000,
 40 |     "y": 665
 41 |   },
 42 |   {
 43 |     "x": 1385769600000,
 44 |     "y": 605
 45 |   },
 46 |   {
 47 |     "x": 1388448000000,
 48 |     "y": 732
 49 |   },
 50 |   {
 51 |     "x": 1391126400000,
 52 |     "y": 795
 53 |   },
 54 |   {
 55 |     "x": 1393545600000,
 56 |     "y": 682
 57 |   },
 58 |   {
 59 |     "x": 1396224000000,
 60 |     "y": 824
 61 |   },
 62 |   {
 63 |     "x": 1398816000000,
 64 |     "y": 868
 65 |   },
 66 |   {
 67 |     "x": 1401494400000,
 68 |     "y": 985
 69 |   },
 70 |   {
 71 |     "x": 1404086400000,
 72 |     "y": 1847
 73 |   },
 74 |   {
 75 |     "x": 1406764800000,
 76 |     "y": 4862
 77 |   },
 78 |   {
 79 |     "x": 1409443200000,
 80 |     "y": 5429
 81 |   },
 82 |   {
 83 |     "x": 1412035200000,
 84 |     "y": 5189
 85 |   },
 86 |   {
 87 |     "x": 1414713600000,
 88 |     "y": 5991
 89 |   },
 90 |   {
 91 |     "x": 1417305600000,
 92 |     "y": 3784
 93 |   },
 94 |   {
 95 |     "x": 1419984000000,
 96 |     "y": 2402
 97 |   },
 98 |   {
 99 |     "x": 1422662400000,
100 |     "y": 2805
101 |   },
102 |   {
103 |     "x": 1425081600000,
104 |     "y": 2738
105 |   },
106 |   {
107 |     "x": 1427760000000,
108 |     "y": 3529
109 |   },
110 |   {
111 |     "x": 1430352000000,
112 |     "y": 3527
113 |   },
114 |   {
115 |     "x": 1433030400000,
116 |     "y": 3331
117 |   },
118 |   {
119 |     "x": 1435622400000,
120 |     "y": 2781
121 |   },
122 |   {
123 |     "x": 1438300800000,
124 |     "y": 2593
125 |   },
126 |   {
127 |     "x": 1440979200000,
128 |     "y": 2829
129 |   },
130 |   {
131 |     "x": 1443571200000,
132 |     "y": 3360
133 |   },
134 |   {
135 |     "x": 1446249600000,
136 |     "y": 3333
137 |   },
138 |   {
139 |     "x": 1448841600000,
140 |     "y": 4304
141 |   },
142 |   {
143 |     "x": 1451520000000,
144 |     "y": 4071
145 |   },
146 |   {
147 |     "x": 1454198400000,
148 |     "y": 4149
149 |   },
150 |   {
151 |     "x": 1456704000000,
152 |     "y": 3320
153 |   },
154 |   {
155 |     "x": 1459382400000,
156 |     "y": 4051
157 |   },
158 |   {
159 |     "x": 1461974400000,
160 |     "y": 4759
161 |   },
162 |   {
163 |     "x": 1464652800000,
164 |     "y": 4484
165 |   },
166 |   {
167 |     "x": 1467244800000,
168 |     "y": 6143
169 |   },
170 |   {
171 |     "x": 1469923200000,
172 |     "y": 4830
173 |   },
174 |   {
175 |     "x": 1472601600000,
176 |     "y": 3115
177 |   },
178 |   {
179 |     "x": 1475193600000,
180 |     "y": 2665
181 |   },
182 |   {
183 |     "x": 1477872000000,
184 |     "y": 1869
185 |   },
186 |   {
187 |     "x": 1480464000000,
188 |     "y": 1260
189 |   },
190 |   {
191 |     "x": 1483142400000,
192 |     "y": 1396
193 |   },
194 |   {
195 |     "x": 1485820800000,
196 |     "y": 1132
197 |   },
198 |   {
199 |     "x": 1488240000000,
200 |     "y": 723
201 |   },
202 |   {
203 |     "x": 1490918400000,
204 |     "y": 800
205 |   },
206 |   {
207 |     "x": 1493510400000,
208 |     "y": 746
209 |   },
210 |   {
211 |     "x": 1496188800000,
212 |     "y": 666
213 |   },
214 |   {
215 |     "x": 1498780800000,
216 |     "y": 666
217 |   },
218 |   {
219 |     "x": 1501459200000,
220 |     "y": 833
221 |   },
222 |   {
223 |     "x": 1504137600000,
224 |     "y": 679
225 |   },
226 |   {
227 |     "x": 1506729600000,
228 |     "y": 500
229 |   },
230 |   {
231 |     "x": 1509408000000,
232 |     "y": 459
233 |   },
234 |   {
235 |     "x": 1512000000000,
236 |     "y": 407
237 |   },
238 |   {
239 |     "x": 1514678400000,
240 |     "y": 403
241 |   }
242 | ]


--------------------------------------------------------------------------------
/scripts/timeseries.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   {
  3 |     "x": 1359590400000,
  4 |     "y": 7389
  5 |   },
  6 |   {
  7 |     "x": 1362009600000,
  8 |     "y": 6121
  9 |   },
 10 |   {
 11 |     "x": 1364688000000,
 12 |     "y": 6232
 13 |   },
 14 |   {
 15 |     "x": 1367280000000,
 16 |     "y": 5897
 17 |   },
 18 |   {
 19 |     "x": 1369958400000,
 20 |     "y": 6590
 21 |   },
 22 |   {
 23 |     "x": 1372550400000,
 24 |     "y": 6730
 25 |   },
 26 |   {
 27 |     "x": 1375228800000,
 28 |     "y": 7585
 29 |   },
 30 |   {
 31 |     "x": 1377907200000,
 32 |     "y": 7784
 33 |   },
 34 |   {
 35 |     "x": 1380499200000,
 36 |     "y": 6660
 37 |   },
 38 |   {
 39 |     "x": 1383177600000,
 40 |     "y": 7830
 41 |   },
 42 |   {
 43 |     "x": 1385769600000,
 44 |     "y": 7970
 45 |   },
 46 |   {
 47 |     "x": 1388448000000,
 48 |     "y": 9832
 49 |   },
 50 |   {
 51 |     "x": 1391126400000,
 52 |     "y": 10697
 53 |   },
 54 |   {
 55 |     "x": 1393545600000,
 56 |     "y": 8668
 57 |   },
 58 |   {
 59 |     "x": 1396224000000,
 60 |     "y": 9675
 61 |   },
 62 |   {
 63 |     "x": 1398816000000,
 64 |     "y": 9086
 65 |   },
 66 |   {
 67 |     "x": 1401494400000,
 68 |     "y": 9135
 69 |   },
 70 |   {
 71 |     "x": 1404086400000,
 72 |     "y": 9717
 73 |   },
 74 |   {
 75 |     "x": 1406764800000,
 76 |     "y": 17970
 77 |   },
 78 |   {
 79 |     "x": 1409443200000,
 80 |     "y": 19101
 81 |   },
 82 |   {
 83 |     "x": 1412035200000,
 84 |     "y": 18890
 85 |   },
 86 |   {
 87 |     "x": 1414713600000,
 88 |     "y": 22104
 89 |   },
 90 |   {
 91 |     "x": 1417305600000,
 92 |     "y": 22577
 93 |   },
 94 |   {
 95 |     "x": 1419984000000,
 96 |     "y": 27335
 97 |   },
 98 |   {
 99 |     "x": 1422662400000,
100 |     "y": 28805
101 |   },
102 |   {
103 |     "x": 1425081600000,
104 |     "y": 28347
105 |   },
106 |   {
107 |     "x": 1427760000000,
108 |     "y": 28717
109 |   },
110 |   {
111 |     "x": 1430352000000,
112 |     "y": 25657
113 |   },
114 |   {
115 |     "x": 1433030400000,
116 |     "y": 24539
117 |   },
118 |   {
119 |     "x": 1435622400000,
120 |     "y": 24519
121 |   },
122 |   {
123 |     "x": 1438300800000,
124 |     "y": 25943
125 |   },
126 |   {
127 |     "x": 1440979200000,
128 |     "y": 26416
129 |   },
130 |   {
131 |     "x": 1443571200000,
132 |     "y": 26011
133 |   },
134 |   {
135 |     "x": 1446249600000,
136 |     "y": 28206
137 |   },
138 |   {
139 |     "x": 1448841600000,
140 |     "y": 27989
141 |   },
142 |   {
143 |     "x": 1451520000000,
144 |     "y": 27912
145 |   },
146 |   {
147 |     "x": 1454198400000,
148 |     "y": 30385
149 |   },
150 |   {
151 |     "x": 1456704000000,
152 |     "y": 26183
153 |   },
154 |   {
155 |     "x": 1459382400000,
156 |     "y": 29246
157 |   },
158 |   {
159 |     "x": 1461974400000,
160 |     "y": 28392
161 |   },
162 |   {
163 |     "x": 1464652800000,
164 |     "y": 28183
165 |   },
166 |   {
167 |     "x": 1467244800000,
168 |     "y": 28803
169 |   },
170 |   {
171 |     "x": 1469923200000,
172 |     "y": 28538
173 |   },
174 |   {
175 |     "x": 1472601600000,
176 |     "y": 27613
177 |   },
178 |   {
179 |     "x": 1475193600000,
180 |     "y": 24331
181 |   },
182 |   {
183 |     "x": 1477872000000,
184 |     "y": 20732
185 |   },
186 |   {
187 |     "x": 1480464000000,
188 |     "y": 18169
189 |   },
190 |   {
191 |     "x": 1483142400000,
192 |     "y": 20972
193 |   },
194 |   {
195 |     "x": 1485820800000,
196 |     "y": 20163
197 |   },
198 |   {
199 |     "x": 1488240000000,
200 |     "y": 14324
201 |   },
202 |   {
203 |     "x": 1490918400000,
204 |     "y": 15718
205 |   },
206 |   {
207 |     "x": 1493510400000,
208 |     "y": 12842
209 |   },
210 |   {
211 |     "x": 1496188800000,
212 |     "y": 11181
213 |   },
214 |   {
215 |     "x": 1498780800000,
216 |     "y": 10209
217 |   },
218 |   {
219 |     "x": 1501459200000,
220 |     "y": 10101
221 |   },
222 |   {
223 |     "x": 1504137600000,
224 |     "y": 9644
225 |   },
226 |   {
227 |     "x": 1506729600000,
228 |     "y": 8017
229 |   },
230 |   {
231 |     "x": 1509408000000,
232 |     "y": 7343
233 |   },
234 |   {
235 |     "x": 1512000000000,
236 |     "y": 7196
237 |   },
238 |   {
239 |     "x": 1514678400000,
240 |     "y": 6432
241 |   }
242 | ]


--------------------------------------------------------------------------------
/scripts/timeseries_ver.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   {
  3 |     "x": 1359590400000,
  4 |     "y": 6695
  5 |   },
  6 |   {
  7 |     "x": 1362009600000,
  8 |     "y": 5573
  9 |   },
 10 |   {
 11 |     "x": 1364688000000,
 12 |     "y": 5695
 13 |   },
 14 |   {
 15 |     "x": 1367280000000,
 16 |     "y": 5396
 17 |   },
 18 |   {
 19 |     "x": 1369958400000,
 20 |     "y": 5983
 21 |   },
 22 |   {
 23 |     "x": 1372550400000,
 24 |     "y": 6143
 25 |   },
 26 |   {
 27 |     "x": 1375228800000,
 28 |     "y": 6922
 29 |   },
 30 |   {
 31 |     "x": 1377907200000,
 32 |     "y": 7201
 33 |   },
 34 |   {
 35 |     "x": 1380499200000,
 36 |     "y": 6128
 37 |   },
 38 |   {
 39 |     "x": 1383177600000,
 40 |     "y": 7165
 41 |   },
 42 |   {
 43 |     "x": 1385769600000,
 44 |     "y": 7365
 45 |   },
 46 |   {
 47 |     "x": 1388448000000,
 48 |     "y": 9100
 49 |   },
 50 |   {
 51 |     "x": 1391126400000,
 52 |     "y": 9902
 53 |   },
 54 |   {
 55 |     "x": 1393545600000,
 56 |     "y": 7986
 57 |   },
 58 |   {
 59 |     "x": 1396224000000,
 60 |     "y": 8851
 61 |   },
 62 |   {
 63 |     "x": 1398816000000,
 64 |     "y": 8218
 65 |   },
 66 |   {
 67 |     "x": 1401494400000,
 68 |     "y": 8150
 69 |   },
 70 |   {
 71 |     "x": 1404086400000,
 72 |     "y": 7870
 73 |   },
 74 |   {
 75 |     "x": 1406764800000,
 76 |     "y": 13108
 77 |   },
 78 |   {
 79 |     "x": 1409443200000,
 80 |     "y": 13672
 81 |   },
 82 |   {
 83 |     "x": 1412035200000,
 84 |     "y": 13701
 85 |   },
 86 |   {
 87 |     "x": 1414713600000,
 88 |     "y": 16113
 89 |   },
 90 |   {
 91 |     "x": 1417305600000,
 92 |     "y": 18793
 93 |   },
 94 |   {
 95 |     "x": 1419984000000,
 96 |     "y": 24933
 97 |   },
 98 |   {
 99 |     "x": 1422662400000,
100 |     "y": 26000
101 |   },
102 |   {
103 |     "x": 1425081600000,
104 |     "y": 25609
105 |   },
106 |   {
107 |     "x": 1427760000000,
108 |     "y": 25188
109 |   },
110 |   {
111 |     "x": 1430352000000,
112 |     "y": 22130
113 |   },
114 |   {
115 |     "x": 1433030400000,
116 |     "y": 21208
117 |   },
118 |   {
119 |     "x": 1435622400000,
120 |     "y": 21738
121 |   },
122 |   {
123 |     "x": 1438300800000,
124 |     "y": 23350
125 |   },
126 |   {
127 |     "x": 1440979200000,
128 |     "y": 23587
129 |   },
130 |   {
131 |     "x": 1443571200000,
132 |     "y": 22651
133 |   },
134 |   {
135 |     "x": 1446249600000,
136 |     "y": 24873
137 |   },
138 |   {
139 |     "x": 1448841600000,
140 |     "y": 23685
141 |   },
142 |   {
143 |     "x": 1451520000000,
144 |     "y": 23841
145 |   },
146 |   {
147 |     "x": 1454198400000,
148 |     "y": 26236
149 |   },
150 |   {
151 |     "x": 1456704000000,
152 |     "y": 22863
153 |   },
154 |   {
155 |     "x": 1459382400000,
156 |     "y": 25195
157 |   },
158 |   {
159 |     "x": 1461974400000,
160 |     "y": 23633
161 |   },
162 |   {
163 |     "x": 1464652800000,
164 |     "y": 23699
165 |   },
166 |   {
167 |     "x": 1467244800000,
168 |     "y": 22660
169 |   },
170 |   {
171 |     "x": 1469923200000,
172 |     "y": 23708
173 |   },
174 |   {
175 |     "x": 1472601600000,
176 |     "y": 24498
177 |   },
178 |   {
179 |     "x": 1475193600000,
180 |     "y": 21666
181 |   },
182 |   {
183 |     "x": 1477872000000,
184 |     "y": 18863
185 |   },
186 |   {
187 |     "x": 1480464000000,
188 |     "y": 16909
189 |   },
190 |   {
191 |     "x": 1483142400000,
192 |     "y": 19576
193 |   },
194 |   {
195 |     "x": 1485820800000,
196 |     "y": 19031
197 |   },
198 |   {
199 |     "x": 1488240000000,
200 |     "y": 13601
201 |   },
202 |   {
203 |     "x": 1490918400000,
204 |     "y": 14918
205 |   },
206 |   {
207 |     "x": 1493510400000,
208 |     "y": 12096
209 |   },
210 |   {
211 |     "x": 1496188800000,
212 |     "y": 10515
213 |   },
214 |   {
215 |     "x": 1498780800000,
216 |     "y": 9543
217 |   },
218 |   {
219 |     "x": 1501459200000,
220 |     "y": 9268
221 |   },
222 |   {
223 |     "x": 1504137600000,
224 |     "y": 8965
225 |   },
226 |   {
227 |     "x": 1506729600000,
228 |     "y": 7517
229 |   },
230 |   {
231 |     "x": 1509408000000,
232 |     "y": 6884
233 |   },
234 |   {
235 |     "x": 1512000000000,
236 |     "y": 6789
237 |   },
238 |   {
239 |     "x": 1514678400000,
240 |     "y": 6029
241 |   }
242 | ]


--------------------------------------------------------------------------------
/scripts/main.py:
--------------------------------------------------------------------------------
  1 | print("# Initialisation")
  2 | import data_exploration
  3 | import sentiment_analysis
  4 | import topic_analysis
  5 | import pandas as pd
  6 | from data_utils import load_dataset
  7 | from data_utils import feature_manipulation
  8 | from data_utils import add_features
  9 | from pathlib import Path
 10 | import re
 11 | 
 12 | from flask import Flask
 13 | from flask import request
 14 | from flask import jsonify
 15 | from flask_cors import CORS
 16 | 
 17 | from bs4 import BeautifulSoup
 18 | 
# Flask application object; CORS is enabled on it so that pages served from
# another origin (presumably the Vue web app — confirm) can call the API.
app = Flask(__name__)
CORS(app)
 21 | 
 22 | def load_initial_dataset():
 23 |     dataset_folder = Path("../datasets/")
 24 |     try:
 25 |     	# Try to load a cached version of the dataframe
 26 |         print("Trying to load the cached dataframe...")
 27 |         df = pd.read_pickle(dataset_folder / 'cached_dataframe.pkl')
 28 |         print("Done")
 29 |     except:
 30 |         print("No cached dataframe, loading the dataset from disk")
 31 |         path_file = dataset_folder / 'Cell_Phones_and_Accessories.json'
 32 |         print(path_file)
 33 |         df = load_dataset(path_file)
 34 |         # Store the dataframe on disk
 35 |         print("Caching the dataframe")
 36 |         df.to_pickle(dataset_folder / 'cached_dataframe.pkl')
 37 |     return df
 38 | 
 39 | 
 40 | def pp_add_features(df):
 41 |     preprocessed = df.copy(True)
 42 |     # Runs vote_to_opinion(), words_count(), transform_unix_date()
 43 |     add_features(preprocessed)
 44 |     return preprocessed
 45 |     
 46 | 
 47 | def preprocessing_post_exploration_dataset(df):
 48 |     dataset_folder = Path("../datasets/")
 49 |     try:
 50 |         print("Trying to load the cached preprocessed dataframe...")
 51 |         preprocessed = pd.read_pickle(dataset_folder / 'cached_preprocessed_dataframe.pkl')
 52 |         print("Done")
 53 |     except:
 54 |         print("No cached dataframe, loading the dataset from disk")
 55 |         preprocessed = df.copy(True)
 56 |         feature_manipulation(preprocessed)
 57 |         print("Caching the preprocessed dataframe")
 58 |         preprocessed.to_pickle(dataset_folder / 'cached_preprocessed_dataframe.pkl')
 59 |     return preprocessed
 60 | 
 61 | @app.route('/')
 62 | def hello():
 63 |     result ={}
 64 |     result["positive"] = sentiment_analysis.compute_single(request.args.get('text'), best_nb, count_vector)[0][1]
 65 |     return jsonify(result)
 66 | 
 67 | def striphtml(reviews):
 68 |     n = 0
 69 |     filtered_reviews = []
 70 |     for text in df['reviewText']:
 71 |         m = re.search('<\s*a[^>]*>(.*?)<\s*/\s*a>', text)
 72 |         if m:
 73 |             soup = BeautifulSoup(text,features="html.parser")
 74 |             stripped_text = soup.get_text()
 75 |             filtered_reviews.append(stripped_text)
 76 |             n = n + 1
 77 |         else:
 78 |             filtered_reviews.append(text)
 79 |             
 80 |     print("HTML stripped on",n,"reviews")
 81 |     return filtered_reviews
 82 | 
 83 | def clean_dirt(df):
 84 |     reviews = df['reviewText'].tolist()
 85 |     htmlcleaned_reviews = striphtml(reviews)
 86 |     df['reviewText'] = [''.join(review) for review in htmlcleaned_reviews]
 87 |     
 88 | 
 89 | def check_dirt(df):
 90 |     c = 0
 91 |     for text in df['reviewText']:
 92 |         m = re.search('<\s*a[^>]*>(.*?)<\s*/\s*a>', text)
 93 |         if m:
 94 |             c = c+1        
 95 |     print(c,"dirty reviews")
 96 | 
 97 | def find_reviews_with_custom_text(df):
 98 |     list_res = []
 99 |     i = 0
100 | 
101 |     while i < 200: #while len(df.index) > 0:
102 |         print("Finding reviews at chunk ", i)
103 |         temp_df = df.head(10000)
104 |         for index, row in temp_df.iterrows():
105 |             if "portable charger" in row['reviewText']:
106 |                 list_res.append(row)
107 |         df = df.iloc[10000:]
108 |         i += 1
109 | 
110 |     result = pd.DataFrame(list_res, columns=df.columns)
111 |     print("Result df is ", len(result.index))
112 |     return result
113 | 
def preprocess_for_custom_LDA(df):
    """Return a deep copy of ``df`` prepared for the custom LDA run.

    The copy is passed through ``add_features`` and then
    ``feature_manipulation``; the input dataframe is left untouched.
    """
    print("Preprocessing for custom LDA")
    working_copy = df.copy(deep=True)
    for step in (add_features, feature_manipulation):
        step(working_copy)
    return working_copy
120 | 
121 | 
# Script entry point. Only the data-exploration exports are currently active;
# the other pipeline stages below are kept as commented-out toggles.
if __name__ == "__main__":
    # NOTE: `df` is a module-level name here, and striphtml() reads it as a global.
    df = load_initial_dataset()

    # Remember to set this back to df
    df_rich = pp_add_features(df)

    # NOTE(review): the commented-out lines below refer to `df_exploration`
    # and `df_analysis`, which are not assigned anywhere in the active code;
    # they must be defined again before these lines are re-enabled.
    #print(df_exploration.columns)
    #print("MIN YEAR ", df_exploration.year.min())
    #print("MAX YEAR ", df_exploration.year.max())
    print("\n# Data Exploration")
    # data_exploration.top_50_products_verified_unverified_both(df_rich)
    #data_exploration.count_reviews(df_rich)
    data_exploration.export_week_day(df_rich)
    data_exploration.export_month(df_rich)
    #data_exploration.export_year(df_rich)
    #data_exploration.run(df_exploration)
    #df_analysis = preprocessing_post_exploration_dataset(df_exploration)
    
    # Report, clean, then re-check reviews containing HTML anchors
    #check_dirt(df_analysis)
    #clean_dirt(df_analysis)
    #check_dirt(df_analysis)

    # Web server exposing the trained models
    # (the `/` route needs `best_nb` and `count_vector` from the line below)
    #best_nb, best_lr, count_vector = sentiment_analysis.run(df_analysis)
    #app.run()

    # Enable Topic Analysis
    #topic_analysis.run(df_analysis)

    #"Portable charger" reviews topic analysis
    #df = find_reviews_with_custom_text(df)
    #df_final = preprocess_for_custom_LDA(df)
    #topic_analysis.run_for_custom_analysis(df_final)
156 | 


--------------------------------------------------------------------------------
/report/1_introduction.tex:
--------------------------------------------------------------------------------
 1 | \chapter{Introduzione}
 2 | 
 3 | Negli ultimi decenni, l'avvento e la popolarizzazione di servizi online ha cambiato il volto dello shopping su larga scala. Piattaforme come Amazon ed eBay fanno parte della vita di tutti i giorni ed è frequente consultare risorse online prima di acquistare. Nel 2017, il volume di vendite nel mercato statunitense che vengono effettuate online ha raggiunto il 9\% e ci si aspetta che arrivi al 12\% nel 2021 \cite{usb1}.
 4 | 
 5 | La crescita del traffico e della portata dei portali di commercio online genera una quantità crescente di dati sulla natura delle transazioni e degli utenti di questo servizio.
 6 | 
 7 | Una parte importante di questi dati è costituita dai contenuti generati dagli utenti che valutano i prodotti acquistati e condividono la loro esperienza. Si tratta principalmente di valutazioni numeriche, spesso corredate da un breve paragrafo testuale.
 8 | 
 9 | Avere a disposizione un insieme di strumenti che possa processare in modo automatico questa mole di dati è fondamentale per tutti gli attori coinvolti nelle transazioni: produttori, clienti/consumatori, venditori, piattaforme di vendita e pubblicitari.
10 | 
11 | Discipline come la Sentiment Analysis estraggono dei dati strutturati da questi contenuti testuali, permettendo uno sguardo statistico sulle tendenze di comunità di acquirenti sotto diversi aspetti di diversi prodotti. Avere un'idea di quali siano gli elementi più o meno apprezzati di un prodotto, secondo le diverse categorie di utenti permette di agire in modo dinamico e veloce sul loro sviluppo e sulla loro pubblicizzazione. I gestori di questi portali invece saranno interessati a profilare gruppi di utenti, estraendone le preferenze, e gli elementi di successo dei prodotti, per proporre raccomandazioni sempre più vincenti, accurate e vicine ai desideri dell'utente.
12 | 
13 | Un altro aspetto da non sottovalutare è quello del valore "genuino" che i contenuti generati da altri consumatori riescono a trasmettere. Le recensioni vengono infatti recepite come fonti affidabili e prive di natura pubblicitaria, rappresentando uno strumento molto potente.
14 | 
15 | Amazon ha sviluppato un sistema per assegnare rilevanza alle recensioni e non è raro che venga usato insieme ad altre tecniche di (auto) marketing, promuovendo articoli con recensioni positive e utili, presentandole ordinate dalla più "convincente" all'utente che sta attraversando il processo di decisione.
16 | 
17 | \section{Obiettivo del progetto}
18 | 
19 | Questo lavoro si sviluppa in tre fasi di seguito sintetizzate.
20 | 
21 | \subsection{Esplorazione}
22 | 
23 | Per approfondire e comprendere la natura di questi contributi, sono state effettuate analisi preliminari sulle recensioni, concentrandoci sulla distribuzione delle opinioni, sulle caratteristiche delle recensioni sui prodotti più rilevanti e sull'evoluzione di questi ultimi fattori nel tempo.
24 | \par
25 | L'obiettivo è sviluppare una visione su diversi aspetti soggettivi (e variabili nel tempo e per categoria) che caratterizzano le recensioni, al fine di comprendere in che modo vengano prodotte ed interpretate.
26 | \par
27 | Abbiamo approfondito caratteristiche come \texttt{verified} e in che modo influenzano il totale dei dati.
28 | \par
29 | Infine, vengono brevemente presentate alcune ricerche che investigano la questione dello sbilanciamento delle recensioni.
30 | 
31 | \subsection{Sentiment Analysis}
32 | 
33 | In questa fase, analizziamo sistematicamente le parti testuali delle recensioni per estrarne un'opinione.
34 | \par
35 | Una parte preliminare pre-processa e prepara il dataset. Vengono scartate recensioni prolisse e ritenute inutili e fatte ulteriori esplorazioni sul nuovo (ristretto) corpo di recensioni.
36 | \par
37 | Infine, alleniamo due classificatori, Naive Bayes e Logistic Regression, che etichettano queste istanze con la variabile target \texttt{opinion} e valutiamo le loro performance.
38 | 
39 | \subsection{Topic Analysis}
40 | 
41 | In questa fase, viene utilizzato un algoritmo che consente di identificare gli argomenti più discussi all'interno di un corpus di documenti. 
42 | \par La fase di preparazione del dataset è la stessa della fase di sentiment analysis.
43 | \par
44 | Vengono inoltre analizzati gli svantaggi e alcune possibili soluzioni del metodo analizzato. Infine, gli argomenti risultanti dalla sua applicazione vengono visualizzati in maniera interattiva.
45 | 
46 | \section{Dataset}
47 | 
48 | Il dataset utilizzato \cite{amazondataset} proviene da un gruppo di ricerca dell'Università di San Diego, che ha estratto e processato le recensioni rilasciate dagli utenti sul sito Amazon.com fino al 2018 in formato JSON.
49 | \par
50 | Abbiamo scelto il dataset della categoria "Cellulari ed Accessori", in una versione densa, contenente solo i dati generati da utenti con almeno 5 recensioni (\textit{5-core}).
51 | 
52 | \section{Software utilizzati}
53 | 
54 | Python è stato lo strumento fondamentale in questo lavoro, scelta dovuta alla grande quantità di strumenti e librerie open source disponibili per questo linguaggio.
55 | \paragraph{}
56 | Tra le librerie utilizzate, ricordiamo:
57 | \begin{itemize}
58 |     \item Pandas per il caricamento, manipolazione e querying dei dataset
59 |     \item Matplotlib per il rendering di grafici e figure direttamente da dataframe pandas
60 |     \item numpy per un supporto efficiente a matrici e vettori di grosse dimensioni
61 |     \item sklearn per machine learning
62 |     \item pyLDAvis per la visualizzazione interattiva dei topic model
63 |     \item NLTK per Natural Language Processing
64 |     \item VueJS per applicazioni web reattive
65 |     \item Flask per realizzare un'API Restful con le funzionalità implementate
66 | \end{itemize}
67 | 
68 | Il versionamento del codice e la possibilità di lavorare in gruppo sono due importanti strumenti offerti da Git, mentre la documentazione è scritta in \LaTeX{}. 
69 | \par
70 | I prodotti del progetto sono: script per ogni fase della pipeline, Notebook Jupyter interattivi, figure e grafici vettoriali ed un'applicazione web composta da un backend in Python e un frontend in Vue.js che offre un'interfaccia utente di facile utilizzo che espone alcune funzionalità del nostro lavoro.
71 | \par
72 | Inoltre, per lo sviluppo della demo sono state utilizzate tecnologie frontend basate su Javascript.


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | scripts/DatasetsPath.txt
  2 | # Don't try to commit gigabytes of datasets
  3 | datasets/*
  4 | # But keep the script to pull them there
  5 | !datasets/pull_datasets.sh
  6 | 
  7 | # Ignore cached dataframes
  8 | *.pkl
  9 | 
 10 | # Exported images and web pages
 11 | *.svg
 12 | 
 13 | ### Python ###
 14 | # Byte-compiled / optimized / DLL files
 15 | __pycache__/
 16 | *.py[cod]
 17 | *$py.class
 18 | 
 19 | # C extensions
 20 | *.so
 21 | 
 22 | # Distribution / packaging
 23 | .Python
 24 | build/
 25 | develop-eggs/
 26 | dist/
 27 | downloads/
 28 | eggs/
 29 | .eggs/
 30 | lib/
 31 | lib64/
 32 | parts/
 33 | sdist/
 34 | var/
 35 | wheels/
 36 | pip-wheel-metadata/
 37 | share/python-wheels/
 38 | *.egg-info/
 39 | .installed.cfg
 40 | *.egg
 41 | MANIFEST
 42 | 
 43 | # PyInstaller
 44 | #  Usually these files are written by a python script from a template
 45 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 46 | *.manifest
 47 | *.spec
 48 | 
 49 | # Installer logs
 50 | pip-log.txt
 51 | pip-delete-this-directory.txt
 52 | 
 53 | # Unit test / coverage reports
 54 | htmlcov/
 55 | .tox/
 56 | .nox/
 57 | .coverage
 58 | .coverage.*
 59 | .cache
 60 | nosetests.xml
 61 | coverage.xml
 62 | *.cover
 63 | .hypothesis/
 64 | .pytest_cache/
 65 | 
 66 | # Translations
 67 | *.mo
 68 | *.pot
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | target/
 78 | 
 79 | # pyenv
 80 | .python-version
 81 | 
 82 | # pipenv
 83 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 84 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 85 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 86 | #   install all needed dependencies.
 87 | #Pipfile.lock
 88 | 
 89 | # celery beat schedule file
 90 | celerybeat-schedule
 91 | 
 92 | # SageMath parsed files
 93 | *.sage.py
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # Mr Developer
103 | .mr.developer.cfg
104 | .project
105 | .pydevproject
106 | 
107 | # mkdocs documentation
108 | /site
109 | 
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 | 
115 | # Pyre type checker
116 | .pyre/
117 | 
118 | ### LaTeX ###
119 | ## Core latex/pdflatex auxiliary files:
120 | *.aux
121 | *.lof
122 | *.log
123 | *.lot
124 | *.fls
125 | *.out
126 | *.toc
127 | *.fmt
128 | *.fot
129 | *.cb
130 | *.cb2
131 | .*.lb
132 | 
133 | ## Intermediate documents:
134 | *.dvi
135 | *.xdv
136 | *-converted-to.*
137 | # these rules might exclude image files for figures etc.
138 | # *.ps
139 | # *.eps
140 | # *.pdf
141 | 
142 | ## Generated if empty string is given at "Please type another file name for output:"
143 | .pdf
144 | 
145 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
146 | *.bbl
147 | *.bcf
148 | *.blg
149 | *-blx.aux
150 | *-blx.bib
151 | *.run.xml
152 | 
153 | ## Build tool auxiliary files:
154 | *.fdb_latexmk
155 | *.synctex
156 | *.synctex(busy)
157 | *.synctex.gz
158 | *.synctex.gz(busy)
159 | *.pdfsync
160 | 
161 | ## Build tool directories for auxiliary files
162 | # latexrun
163 | latex.out/
164 | 
165 | ## Auxiliary and intermediate files from other packages:
166 | # algorithms
167 | *.alg
168 | *.loa
169 | 
170 | # achemso
171 | acs-*.bib
172 | 
173 | # amsthm
174 | *.thm
175 | 
176 | # beamer
177 | *.nav
178 | *.pre
179 | *.snm
180 | *.vrb
181 | 
182 | # changes
183 | *.soc
184 | 
185 | # comment
186 | *.cut
187 | 
188 | # cprotect
189 | *.cpt
190 | 
191 | # elsarticle (documentclass of Elsevier journals)
192 | *.spl
193 | 
194 | # endnotes
195 | *.ent
196 | 
197 | # fixme
198 | *.lox
199 | 
200 | # feynmf/feynmp
201 | *.mf
202 | *.mp
203 | *.t[1-9]
204 | *.t[1-9][0-9]
205 | *.tfm
206 | 
207 | #(r)(e)ledmac/(r)(e)ledpar
208 | *.end
209 | *.?end
210 | *.[1-9]
211 | *.[1-9][0-9]
212 | *.[1-9][0-9][0-9]
213 | *.[1-9]R
214 | *.[1-9][0-9]R
215 | *.[1-9][0-9][0-9]R
216 | *.eledsec[1-9]
217 | *.eledsec[1-9]R
218 | *.eledsec[1-9][0-9]
219 | *.eledsec[1-9][0-9]R
220 | *.eledsec[1-9][0-9][0-9]
221 | *.eledsec[1-9][0-9][0-9]R
222 | 
223 | # glossaries
224 | *.acn
225 | *.acr
226 | *.glg
227 | *.glo
228 | *.gls
229 | *.glsdefs
230 | 
231 | # uncomment this for glossaries-extra (will ignore makeindex's style files!)
232 | # *.ist
233 | 
234 | # gnuplottex
235 | *-gnuplottex-*
236 | 
237 | # gregoriotex
238 | *.gaux
239 | *.gtex
240 | 
241 | # htlatex
242 | *.4ct
243 | *.4tc
244 | *.idv
245 | *.lg
246 | *.trc
247 | *.xref
248 | 
249 | # hyperref
250 | *.brf
251 | 
252 | # knitr
253 | *-concordance.tex
254 | # TODO Comment the next line if you want to keep your tikz graphics files
255 | *.tikz
256 | *-tikzDictionary
257 | 
258 | # listings
259 | *.lol
260 | 
261 | # luatexja-ruby
262 | *.ltjruby
263 | 
264 | # makeidx
265 | *.idx
266 | *.ilg
267 | *.ind
268 | 
269 | # minitoc
270 | *.maf
271 | *.mlf
272 | *.mlt
273 | *.mtc[0-9]*
274 | *.slf[0-9]*
275 | *.slt[0-9]*
276 | *.stc[0-9]*
277 | 
278 | # minted
279 | _minted*
280 | *.pyg
281 | 
282 | # morewrites
283 | *.mw
284 | 
285 | # nomencl
286 | *.nlg
287 | *.nlo
288 | *.nls
289 | 
290 | # pax
291 | *.pax
292 | 
293 | # pdfpcnotes
294 | *.pdfpc
295 | 
296 | # sagetex
297 | *.sagetex.sage
298 | *.sagetex.py
299 | *.sagetex.scmd
300 | 
301 | # scrwfile
302 | *.wrt
303 | 
304 | # sympy
305 | *.sout
306 | *.sympy
307 | sympy-plots-for-*.tex/
308 | 
309 | # pdfcomment
310 | *.upa
311 | *.upb
312 | 
313 | # pythontex
314 | *.pytxcode
315 | pythontex-files-*/
316 | 
317 | # tcolorbox
318 | *.listing
319 | 
320 | # thmtools
321 | *.loe
322 | 
323 | # TikZ & PGF
324 | *.dpth
325 | *.md5
326 | *.auxlock
327 | 
328 | # todonotes
329 | *.tdo
330 | 
331 | # vhistory
332 | *.hst
333 | *.ver
334 | 
335 | # easy-todo
336 | *.lod
337 | 
338 | # xcolor
339 | *.xcp
340 | 
341 | # xmpincl
342 | *.xmpi
343 | 
344 | # xindy
345 | *.xdy
346 | 
347 | # xypic precompiled matrices
348 | *.xyc
349 | 
350 | # endfloat
351 | *.ttt
352 | *.fff
353 | 
354 | # Latexian
355 | TSWLatexianTemp*
356 | 
357 | ## Editors:
358 | # WinEdt
359 | *.bak
360 | *.sav
361 | 
362 | # Texpad
363 | .texpadtmp
364 | 
365 | # LyX
366 | *.lyx~
367 | 
368 | # Kile
369 | *.backup
370 | 
371 | # KBibTeX
372 | *~[0-9]*
373 | 
374 | # auto folder when using emacs and auctex
375 | ./auto/*
376 | *.el
377 | 
378 | # expex forward references with \gathertags
379 | *-tags.tex
380 | 
381 | # standalone packages
382 | *.sta
383 | 
384 | ### LaTeX Patch ###
385 | # glossaries
386 | *.glstex
387 | 
388 | # 
389 | */etc/*
390 | */.idea/*
391 | */.ipynb_checkpoints/*
392 | 


--------------------------------------------------------------------------------
/scripts/data_utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | ### Import libraries ###
  4 | 
  5 | import pandas as pd
  6 | import numpy as np
  7 | import re
  8 | import nltk
  9 | from nltk.tokenize import RegexpTokenizer
 10 | nltk.download('punkt')
 11 | nltk.download('stopwords')
 12 | import spacy
 13 | 
 14 | from collections import defaultdict
 15 | 
 16 | from html.parser import HTMLParser
 17 | 
 18 | ### Functions ###
 19 | 
 20 | def load_dataset(pathfile):
 21 |     df = pd.read_json(pathfile, lines=True)
 22 |     df['vote'].fillna(0, inplace=True)
 23 |     df['vote'] = pd.to_numeric(df['vote'], errors='coerce')
 24 |     df = df[np.isfinite(df['vote'])]
 25 |     df['vote'] = df['vote'].astype(int)
 26 |     df.dropna(subset=['reviewText'], inplace=True)
 27 |     return df
 28 | 
 29 | 
 30 | ### DATA MANIPULATION ###
 31 | def remove_cols(df):
 32 |     df.drop(['image',
 33 |              'reviewTime',
 34 |              'reviewerName',
 35 |              'style',
 36 |              'unixReviewTime'], axis=1, inplace=True)
 37 | 
 38 | 
 39 | def vote_to_opinion(df):
 40 |     df.loc[df.overall == 3, 'opinion'] = "neutral"
 41 |     df.loc[df.overall > 3, 'opinion'] = "positive"
 42 |     df.loc[df.overall < 3, 'opinion'] = "negative"
 43 | 
 44 | 
 45 | def words_count(df):
 46 |     df['n_words'] = [len(t) for t in df['reviewText']]
 47 |     
 48 | 
 49 | def transform_unix_date(df):
 50 |     df['date'] = pd.to_datetime(df['unixReviewTime'], unit='s')
 51 |     df['month_year'] = df['date'].dt.to_period('M')
 52 |     df['month'] = df['date'].dt.month
 53 |     df['year'] = df['date'].dt.year
 54 |     df['week_day'] = df['date'].dt.dayofweek
 55 | 
 56 | contractions_dict = {
 57 |     "ain't": "am not / are not / is not / has not / have not",
 58 |     "aren't": "are not / am not",
 59 |     "can't": "cannot",
 60 |     "can't've": "cannot have",
 61 |     "'cause": "because",
 62 |     "could've": "could have",
 63 |     "couldn't": "could not",
 64 |     "couldn't've": "could not have",
 65 |     "didn't": "did not",
 66 |     "doesn't": "does not",
 67 |     "don't": "do not",
 68 |     "hadn't": "had not",
 69 |     "hadn't've": "had not have",
 70 |     "hasn't": "has not",
 71 |     "haven't": "have not",
 72 |     "he'd": "he had / he would",
 73 |     "he'd've": "he would have",
 74 |     "he'll": "he shall / he will",
 75 |     "he'll've": "he shall have / he will have",
 76 |     "he's": "he has / he is",
 77 |     "how'd": "how did",
 78 |     "how'd'y": "how do you",
 79 |     "how'll": "how will",
 80 |     "how's": "how has / how is / how does",
 81 |     "I'd": "I had / I would",
 82 |     "I'd've": "I would have",
 83 |     "I'll": "I shall / I will",
 84 |     "I'll've": "I shall have / I will have",
 85 |     "I'm": "I am",
 86 |     "I've": "I have",
 87 |     "isn't": "is not",
 88 |     "it'd": "it had / it would",
 89 |     "it'd've": "it would have",
 90 |     "it'll": "it shall / it will",
 91 |     "it'll've": "it shall have / it will have",
 92 |     "it's": "it has / it is",
 93 |     "let's": "let us",
 94 |     "ma'am": "madam",
 95 |     "mayn't": "may not",
 96 |     "might've": "might have",
 97 |     "mightn't": "might not",
 98 |     "mightn't've": "might not have",
 99 |     "must've": "must have",
100 |     "mustn't": "must not",
101 |     "mustn't've": "must not have",
102 |     "needn't": "need not",
103 |     "needn't've": "need not have",
104 |     "o'clock": "of the clock",
105 |     "oughtn't": "ought not",
106 |     "oughtn't've": "ought not have",
107 |     "shan't": "shall not",
108 |     "sha'n't": "shall not",
109 |     "shan't've": "shall not have",
110 |     "she'd": "she had / she would",
111 |     "she'd've": "she would have",
112 |     "she'll": "she shall / she will",
113 |     "she'll've": "she shall have / she will have",
114 |     "she's": "she has / she is",
115 |     "should've": "should have",
116 |     "shouldn't": "should not",
117 |     "shouldn't've": "should not have",
118 |     "so've": "so have",
119 |     "so's": "so as / so is",
120 |     "that'd": "that would / that had",
121 |     "that'd've": "that would have",
122 |     "that's": "that has / that is",
123 |     "there'd": "there had / there would",
124 |     "there'd've": "there would have",
125 |     "there's": "there has / there is",
126 |     "they'd": "they had / they would",
127 |     "they'd've": "they would have",
128 |     "they'll": "they shall / they will",
129 |     "they'll've": "they shall have / they will have",
130 |     "they're": "they are",
131 |     "they've": "they have",
132 |     "to've": "to have",
133 |     "wasn't": "was not",
134 |     "we'd": "we had / we would",
135 |     "we'd've": "we would have",
136 |     "we'll": "we will",
137 |     "we'll've": "we will have",
138 |     "we're": "we are",
139 |     "we've": "we have",
140 |     "weren't": "were not",
141 |     "what'll": "what shall / what will",
142 |     "what'll've": "what shall have / what will have",
143 |     "what're": "what are",
144 |     "what's": "what has / what is",
145 |     "what've": "what have",
146 |     "when's": "when has / when is",
147 |     "when've": "when have",
148 |     "where'd": "where did",
149 |     "where's": "where has / where is",
150 |     "where've": "where have",
151 |     "who'll": "who shall / who will",
152 |     "who'll've": "who shall have / who will have",
153 |     "who's": "who has / who is",
154 |     "who've": "who have",
155 |     "why's": "why has / why is",
156 |     "why've": "why have",
157 |     "will've": "will have",
158 |     "won't": "will not",
159 |     "won't've": "will not have",
160 |     "would've": "would have",
161 |     "wouldn't": "would not",
162 |     "wouldn't've": "would not have",
163 |     "y'all": "you all",
164 |     "y'all'd": "you all would",
165 |     "y'all'd've": "you all would have",
166 |     "y'all're": "you all are",
167 |     "y'all've": "you all have",
168 |     "you'd": "you had / you would",
169 |     "you'd've": "you would have",
170 |     "you'll": "you shall / you will",
171 |     "you'll've": "you shall have / you will have",
172 |     "you're": "you are",
173 |     "you've": "you have"
174 | }
175 | contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))
176 | punctuation_re = re.compile('([!,.:;?])(\w)')
177 | tokenizer = RegexpTokenizer(r'\w+')
178 | 
179 | 
def expand_contractions(string, contractions_dict=contractions_dict):
    """Return ``string`` with every contraction replaced by its expansion.

    Matches are found with the module-level ``contractions_re`` and each
    matched text is looked up in ``contractions_dict``.
    """
    return contractions_re.sub(
        lambda hit: contractions_dict[hit.group(0)],
        string,
    )
185 | 
186 | 
def fix_punctuation(string, contractions_dict=contractions_dict):
    """Insert the missing space after punctuation glued to the next word.

    Every match of the module-level ``punctuation_re`` (a punctuation mark
    directly followed by a word character) is rewritten with a single space
    between the two. ``contractions_dict`` is accepted but not used.
    """
    # Same rewrite as concatenating group 1, a space and group 2.
    return punctuation_re.sub(r'\1 \2', string)
194 | 
195 | 
def remove_less_frequent_words(reviews, min_count=2):
    """Drop the tokens that are too rare across the whole corpus.

    Parameters
    ----------
    reviews : iterable of iterable of str
        Tokenised reviews. Any iterable is accepted (including generators);
        it is materialised once because two passes are needed.
    min_count : int, optional
        Minimum number of occurrences, counted over all reviews, a token
        needs in order to be kept. The default of 2 removes tokens that
        appear only once.

    Returns
    -------
    list of list of str
        One list per input review, in the same order, with rare tokens
        removed. A review may become an empty list.
    """
    reviews = [list(review) for review in reviews]

    frequency = defaultdict(int)
    for review in reviews:
        for token in review:
            frequency[token] += 1

    return [
        [token for token in review if frequency[token] >= min_count]
        for review in reviews
    ]
204 | 
205 | 
206 | nlp = spacy.load('en', disable=['parser', 'ner'])
207 | 
208 | 
def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise a list of tokens, keeping only selected parts of speech.

    The tokens are joined with single spaces, run through the module-level
    spaCy pipeline ``nlp``, and the lemma of every resulting token whose
    ``pos_`` tag is in ``allowed_postags`` is returned, in document order.
    Tag names: https://spacy.io/api/annotation
    """
    lemmas = []
    for tok in nlp(' '.join(text)):
        if tok.pos_ in allowed_postags:
            lemmas.append(tok.lemma_)
    return lemmas
213 | 
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(); no second call is needed.
        super().__init__()
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text content."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text chunks concatenated together."""
        return ''.join(self.fed)


def remove_html(review):
    """Return ``review`` with its HTML markup removed.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags and entities.

    Returns
    -------
    str
        The text content only; character references such as ``&amp;`` are
        decoded by the parser.
    """
    s = MLStripper()
    s.feed(review)
    # feed() may hold back trailing text that contains an unterminated "&"
    # (e.g. "AT&T") while waiting for more input; close() flushes it. Without
    # this call such text was silently lost.
    s.close()
    return s.get_data()
228 | 
def text_preprocessing(reviews, remove_less_frequent=True):
    """Turn raw review strings into lists of lemmas.

    Each review is lower-cased, stripped of HTML, has the spacing after
    punctuation fixed and its contractions expanded, is tokenised, filtered
    (English stop words and tokens of at most two characters are dropped) and
    finally lemmatised.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.
    remove_less_frequent : bool, optional
        When true (default) the result is additionally passed through
        ``remove_less_frequent_words``.

    Returns
    -------
    list of list of str
        One list of lemmas per input review, in the same order.
    """
    reviews = [remove_html(review.lower()) for review in reviews]
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every single token.
    stopwords = set(nltk.corpus.stopwords.words("english"))
    total = len(reviews)

    filtered_reviews = []
    for no_review, review in enumerate(reviews, start=1):
        if no_review % 100 == 0:
            print('Review n.', no_review, '/', total)
        try:
            review = fix_punctuation(review)
            review = expand_contractions(review)
        except Exception:
            # Best effort: keep the text as cleaned so far, but do not
            # swallow KeyboardInterrupt/SystemExit like a bare except would.
            print(review, "something happened")

        filtered_review = [
            word
            for word in tokenizer.tokenize(review)
            if word not in stopwords and len(word) > 2
        ]
        filtered_reviews.append(lemmatization(filtered_review))

    if remove_less_frequent:
        filtered_reviews = remove_less_frequent_words(filtered_reviews)

    return filtered_reviews
257 | 
258 | 
def preprocessed_reviews(df):
    """Add a ``preprocessedReview`` column and drop reviews left empty.

    ``reviewText`` is run through ``text_preprocessing`` and the resulting
    lemmas are stored as a space-joined string. Rows whose preprocessed text
    is empty are then removed from ``df`` itself.

    The previous version assigned the filtered frame to a local variable,
    so the caller's frame was never actually filtered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``reviewText`` column; modified in place.

    Returns
    -------
    None
    """
    cleaned = text_preprocessing(df['reviewText'].tolist())
    df['preprocessedReview'] = [' '.join(tokens) for tokens in cleaned]

    # Rows are dropped by index label, so this assumes the labels are unique.
    empty = df['preprocessedReview'] == ''
    df.drop(df.index[empty], inplace=True)
264 | 
265 | 
def feature_manipulation(df):
    """Apply the in-place clean-up steps to ``df`` and return it.

    Runs ``remove_cols`` and then ``preprocessed_reviews`` on the frame, and
    returns the same object that was passed in.
    """
    for step in (remove_cols, preprocessed_reviews):
        step(df)
    return df
270 | 
271 |     
def add_features(df):
    """Add the derived columns to ``df`` in place.

    Runs, in order, ``vote_to_opinion``, ``words_count`` and
    ``transform_unix_date`` on the frame. Returns nothing.
    """
    for enrich in (vote_to_opinion, words_count, transform_unix_date):
        enrich(df)
276 |     
277 |     
def most_reviewed_products(df, n_products):
    """Keep only the reviews of the ``n_products`` most reviewed products.

    The previous implementation used ``value_counts().reset_index()`` and
    then referred to the resulting columns as ``'index'`` and ``'asin'``.
    Those names changed in pandas 2.0, where the same code raises
    ``KeyError``. Filtering with ``isin`` gives the same rows on every
    pandas version.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews, with the product identifier in the ``asin`` column.
    n_products : int
        How many of the most reviewed products to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` that belong to the selected
        products, in their original order, with the same columns and a fresh
        0..n-1 index.
    """
    top_asins = df['asin'].value_counts().nlargest(n_products).index
    return df[df['asin'].isin(top_asins)].reset_index(drop=True)
286 | 


--------------------------------------------------------------------------------
/report/4_topic_analysis.tex:
--------------------------------------------------------------------------------
  1 | \chapter{Topic analysis}
  2 | La topic analysis consente di identificare gli argomenti più discussi semplicemente contando le parole all’interno di un corpus di documenti e raggruppando modelli di parole simili. 
  3 | \par 
  4 | È una tecnica di machine learning che organizza e comprende grandi raccolte di dati testuali, assegnando tag o categorie in base all’argomento o al tema di ogni singolo testo. 
  5 | \par 
  6 | I risultati sono più dettagliati e interessanti rispetto alla sentiment analysis, in quanto la topic analysis esamina più da vicino le informazioni dietro un testo.
  7 | \par
  8 | Sono comunque due metodi che, se usati in combinazione, consentono di restringere ulteriormente queste informazioni per trovare con precisione quali temi vengono discussi, fornendo quindi informazioni fruibili riguardanti il prodotto.
  9 | 
 10 | \section{Algoritmo utilizzato}
 11 | Il metodo di riferimento di topic analysis è Latent Dirichlet Allocation (LDA) \cite{blei2003latent}: è una tecnica di machine learning non supervisionata che consente di inferire schemi e raggruppare espressioni simili senza la necessità di definire gli argomenti a priori. L'assunzione di LDA è che ogni documento può essere descritto da una distribuzione di argomenti, e ciascun argomento può altresì essere descritto da una distribuzione di parole.
 12 | \par
 13 | LDA è un modello ampiamente documentato in Python e per questo motivo è stata la scelta naturale per effettuare il task di topic analysis. 
 14 | \par 
 15 | Detto questo, è da far notare che la letteratura scientifica negli ultimi anni ha prodotto diversi modelli che mettono in risalto alcuni svantaggi del modello LDA. 
 16 | \subsection{Individuazione topic}
 17 | \label{indiv_topic}
 18 | Una limitazione è stata riscontrata nello sviluppo del modello MG-LDA in cui viene asserito che i modelli standard (come LDA) tendono a produrre topic che corrispondono alle proprietà globali degli oggetti in analisi piuttosto che agli aspetti di un oggetto che tendono ad essere valutati da un utente. 
 19 | \par
  20 | La soluzione adottata in questo progetto, vista l'assenza di una implementazione per il modello MG-LDA \cite{titov2008joint}, è stata quella di applicare LDA su prodotti presi singolarmente - considerando ovviamente i prodotti con più osservazioni all'interno del dataset - nonostante la pratica più diffusa in letteratura sembri quella di applicare il modello sull'intero corpus di documenti a prescindere dall'eterogeneità dei documenti stessi. Questa scelta è stata presa di proposito per evitare la formazione di macro-topic.
 21 | \subsection{Sentiment topic}
 22 | Un'altra carenza riscontrata è l'assenza di rilevazione del sentiment: questo compito è risolto da diversi modelli (per esempio JST \cite{lin2009joint}, basato su LDA), che suddividono il testo in argomenti, assegnando simultaneamente a ciascuno un livello di sentimento.
 23 | \par
 24 | L'idea per questo progetto, vista la mancanza di una implementazione del modello JST, è stata quella di utilizzare un approccio lexicon-based (VADER \cite{hutto2014vader}) in combinazione con l'approccio non supervisionato, in modo da poter fornire una visione generale ed approssimata del sentiment dei topic prodotti da LDA. Di seguito viene presentata la procedura implementata:
 25 | \begin{itemize}
 26 |     \item Per ogni recensione viene calcolato il rispettivo sentiment. In particolare, l'output fornito è una variabile \texttt{compound} $\in [-1, +1]$ che rappresenta un sentiment:
 27 |         \begin{itemize}
 28 |             \item \textbf{positivo} se \texttt{compound} $\geq 0.05$
  29 |             \item \textbf{neutrale} se $-0.05 <$ \texttt{compound} $< 0.05$
 30 |             \item \textbf{negativo} se \texttt{compound} $\leq -0.05$
 31 |         \end{itemize}
 32 |     \item In seguito all'applicazione del modello LDA, ogni recensione ha associata la probabilità con la quale ha contribuito alla formazione del topic.
  33 |     \item Per ogni recensione viene estratto il topic con probabilità maggiore (se $> 0.7$). In questo caso, si assume che la recensione sia inerente al suddetto topic, altrimenti viene scartata dal conto.
 34 |     \item Per ogni topic viene effettuata la media del sentiment delle recensioni associate al suddetto topic
 35 | \end{itemize}
 36 | 
  37 | Questa soluzione non è parsa consistente in quanto circa lo 0.1\% delle recensioni appartenevano ad un topic specifico con probabilità superiore a 0.7. L'abbassamento della soglia di probabilità è anch'esso una soluzione inconsistente.
 38 | 
 39 | \section{Procedimento}
 40 | 
 41 | La fase di topic analysis condivide le operazioni preliminari di preparazione del dataset presentate nel Capitolo \ref{preprocessing} e nel Capitolo \ref{bow}.
 42 | \par
 43 | L'algoritmo LDA è stato applicato su oggetti presi singolarmente per i problemi sottolineati nel Capitolo \ref{indiv_topic}. Il criterio di scelta dei prodotti da analizzare è la loro popolarità in termini di recensioni.
 44 | \par 
 45 | In particolare, soffermandoci sulla Figura \ref{opinion_bestseller_products} si può notare quanto sono dominanti, in percentuale, le recensioni positive. Ciò rispecchia connotati già riscontrati durante l'esplorazione del dataset; nonostante ciò alcuni prodotti tra i più recensiti mostrano una percentuale di recensioni negative e neutrali elevata rispetto alla media.
 46 | \par 
 47 | Considerando i tempi di computazione, un totale di 6 prodotti si è rivelato essere un buon trade-off per un'analisi variegata, prendendo i 3 prodotti con la media di \texttt{overall} più alta e i 3 prodotti con la media di \texttt{overall} più bassa.
 48 | \par
 49 | Essendo LDA un algoritmo non supervisionato che produce argomenti astratti senza conoscerne il numero a priori, solitamente necessita di un tuning degli iperparametri per individuare il modello migliore. Gli iperparametri sono:
 50 | 
 51 | \begin{itemize}
 52 |     \item K: è il numero di argomenti da estrarre dal corpus di documenti disponibile
 53 |     \item $\alpha$: è il parametro relativo alla distribuzione che regola l’aspetto della distribuzione degli argomenti per tutti i documenti del corpus. Tipicamente viene scelto un valore di $\alpha < 1$ per ottenere una distribuzione sparsa di argomenti per documento.
 54 |     \item $\beta$: è il parametro relativo alla distribuzione che regola l’aspetto della distribuzione delle parole in ciascun argomento. Per lo stesso motivo di $\alpha$, viene scelto un valore $\beta < 1$.
  55 | \end{itemize}
 56 | 
 57 | Nella Tabella \ref{values_hyper} vengono mostrati i possibili valori assumibili dagli iperparametri.
 58 | \begin{table}[H]
 59 | \small  
 60 | \centering
 61 | \begin{tabular}{|p{0.20\textwidth}|p{0.28\textwidth}|}
 62 | \hline
 63 | Iperparametro & Valori possibili\\
 64 | \hline
 65 | K & [2, 3, 4, 5, 6, 7, 8, 9, 10]\\
 66 | $\alpha$ & [0.1, 1]\\
 67 | $\beta$ & [0.01, 0.1, 1]\\
 68 | \hline
 69 | \end{tabular}
 70 | \caption{Possibili valori degli iperparametri di LDA}
 71 | \label{values_hyper}
 72 | \end{table}
 73 | 
 74 | Per valutare la qualità degli argomenti appresi viene usato il punteggio di \textit{coherence}. Per ogni prodotto:
 75 | \begin{itemize}
 76 |     \item Viene applicato l'algoritmo iterando sull'insieme di iperparametri
 77 |     \item Ogni modello risultante ottiene un punteggio
 78 |     \item Il modello con il punteggio più alto è l'ottimale
  79 | \end{itemize}
 80 | 
 81 | Nella Tabella \ref{hyper_opt} vengono mostrati i modelli ottimali per ciascun prodotto considerato per l'analisi. 
 82 | 
 83 | \begin{table}[H]
 84 | \small  
 85 | \centering
 86 | \begin{tabular}{|p{0.20\textwidth}|p{0.05\textwidth}|p{0.05\textwidth}|p{0.05\textwidth}||p{0.20\textwidth}|}
 87 | \hline
 88 | Codice prodotto & $\alpha$ & $\beta$ & K & Coherence score\\
 89 | \hline
 90 | B00MXWFUQC & 1 & 1 & 3 & 0.48 \\
 91 | B0092KJ9BU & 0.1 & 1 & 7 & 0.50 \\
  92 | B00UCZGS6S & 0.1 & 0.01 & 2 & 0.55 \\
 93 | B00VH88CJ0 & 0.1 & 0.01 & 2 & 0.57 \\
 94 | B005NF5NTK & 1 & 0.1 & 3 & 0.60 \\
 95 | B00X5RV14Y & 0.1 & 1 & 6 & 0.55 \\
 96 | \hline
 97 | \end{tabular}
 98 | \caption{Iperparametri del modello ottimale con rispettivo punteggio di coherence}
 99 | \label{hyper_opt}
100 | \end{table}
101 | 
102 | In Figura \ref{coherence_plots} vengono invece mostrati i grafici del punteggio di \textit{coherence} per ciascun prodotto e con gli iperparametri $\alpha$ e $\beta$ mostrati nella Tabella \ref{hyper_opt}. 
103 | 
104 | \begin{figure}[H]
105 |     \centering
106 |     \subfigure[Coherence plot of B005NF5NTK]{\includesvg[width=0.42\textwidth]{figures/3_coherence_plot_B005NF5NTK.svg}} 
107 |     \subfigure[Coherence plot of B0092KJ9BU]{\includesvg[width=0.42\textwidth]{figures/3_coherence_plot_B0092KJ9BU.svg}}
108 |     \subfigure[Coherence plot of B00MXWFUQC]{\includesvg[width=0.42\textwidth]{figures/3_coherence_plot_B00MXWFUQC.svg}}
109 |     \subfigure[Coherence plot of B00UCZGS6S]{\includesvg[width=0.42\textwidth]{figures/3_coherence_plot_B00UCZGS6S.svg}}
110 |     \subfigure[Coherence plot of B00VH88CJ0]{\includesvg[width=0.42\textwidth]{figures/3_coherence_plot_B00VH88CJ0.svg}}
111 |     \subfigure[Coherence plot of B00X5RV14Y]{\includesvg[width=0.42\textwidth]{figures/3_coherence_plot_B00X5RV14Y.svg}}
112 |     \caption{Coherence plots of products}
113 |     \label{coherence_plots}
114 | \end{figure}
115 | 
116 | \section{Visualizzazione dei risultati}
117 | 
118 | Per valutare i risultati prodotti dall'algoritmo LDA abbiamo usufruito di pyLDAvis, una libreria Python basata su \cite{sievert2014ldavis} che permette di visualizzare gli argomenti in maniera interattiva. 
119 | \par
120 | Fornisce una visione globale degli argomenti e di come differiscono l'uno dall'altro, consentendo allo stesso tempo un'analisi approfondita dei termini maggiormente associati a ciascun singolo argomento.
121 | \par
122 | Il pannello di sinistra visualizza gli argomenti come cerchi nel piano bidimensionale i cui centri sono determinati calcolando la divergenza di Jensen-Shannon tra gli argomenti.
123 | \par
124 | Il pannello di destra mostra un grafico a barre orizzontali, le cui barre rappresentano i (30) singoli termini che sono i più rilevanti per interpretare l'argomento attualmente selezionato a sinistra. Una coppia di barre sovrapposte rappresenta sia la frequenza di un determinato termine a livello di corpus (barre blu) sia la frequenza specifica dell'argomento del termine (barre rosse).
125 | \par
126 | Sempre nel pannello di destra, appena sopra il grafico a barre orizzontali, è possibile regolare per mezzo di uno slider il valore $\lambda$, con $0 \leq \lambda \leq 1$. Esso consente di classificare la pertinenza di un termine rispetto a un argomento.
127 | \par 
128 | Valori di $\lambda$ vicino a 0 evidenziano i termini potenzialmente rari ma esclusivi per l'argomento selezionato, mentre valori di $ \lambda$ vicino a 1 evidenziano i termini frequenti ma non necessariamente esclusivi per l'argomento selezionato.
129 | \par
 130 | L'impostazione consigliata in \cite{sievert2014ldavis} suggerisce un valore di $\lambda$ intorno a 0.6, che è stato dimostrato essere di aiuto per gli utenti per interpretare l'argomento, nonostante sia fatto presente che il valore ottimale può variare in base al dataset e agli argomenti stessi.


--------------------------------------------------------------------------------
/scripts/requirements.txt:
--------------------------------------------------------------------------------
  1 | alabaster==0.7.12
  2 | argh==0.26.2
  3 | astroid==2.3.3
  4 | atomicwrites==1.3.0
  5 | attrs==19.3.0
  6 | autopep8==1.4.4
  7 | Babel==2.7.0
  8 | backcall==0.1.0
  9 | bleach==3.1.4
 10 | blis==0.4.1
 11 | boto==2.49.0
 12 | boto3==1.10.34
 13 | botocore==1.13.34
 14 | catalogue==0.0.8
 15 | certifi==2019.11.28
 16 | cffi==1.13.2
 17 | chardet==3.0.4
 18 | cloudpickle==1.2.2
 19 | cryptography==2.8
 20 | cycler==0.10.0
 21 | cymem==2.0.3
 22 | decorator==4.4.1
 23 | defusedxml==0.6.0
 24 | diff-match-patch==20181111
 25 | docutils==0.15.2
 26 | entrypoints==0.3
 27 | flake8==3.7.9
 28 | funcy==1.14
 29 | future==0.18.2
 30 | gensim==3.8.1
 31 | helpdev==0.6.10
 32 | idna==2.8
 33 | imagesize==1.1.0
 34 | importlib-metadata==1.2.0
 35 | intervaltree==3.0.2
 36 | ipykernel==5.1.3
 37 | ipython==7.0.0
 38 | ipython-genutils==0.2.0
 39 | isort==4.3.21
 40 | jedi==0.14.1
 41 | jeepney==0.4.1
 42 | Jinja2==2.10.3
 43 | jmespath==0.9.4
 44 | joblib==0.14.0
 45 | jsonschema==3.2.0
 46 | jupyter-client==5.3.4
 47 | jupyter-core==4.6.1
 48 | keyring==20.0.0
 49 | kiwisolver==1.1.0
 50 | lazy-object-proxy==1.4.3
 51 | MarkupSafe==1.1.1
 52 | matplotlib==3.1.2
 53 | mccabe==0.6.1
 54 | mistune==0.8.4
 55 | more-itertools==8.0.2
 56 | murmurhash==1.0.2
 57 | nbconvert==5.6.1
 58 | nbformat==4.4.0
 59 | nltk==3.4.5
 60 | numexpr==2.7.0
 61 | numpy==1.17.4
 62 | numpydoc==0.9.1
 63 | packaging==19.2
 64 | pandas==0.25.3
 65 | pandocfilters==1.4.2
 66 | parso==0.5.1
 67 | pathtools==0.1.2
 68 | pexpect==4.7.0
 69 | pickleshare==0.7.5
 70 | Pillow==6.2.1
 71 | plac==1.1.3
 72 | pluggy==0.13.1
 73 | preshed==3.0.2
 74 | prompt-toolkit==2.0.10
 75 | psutil==5.6.7
 76 | ptyprocess==0.6.0
 77 | py==1.8.0
 78 | pycodestyle==2.5.0
 79 | pycparser==2.19
 80 | pydocstyle==4.0.1
 81 | pyflakes==2.1.1
 82 | Pygments==2.5.2
 83 | pyLDAvis==2.1.2
 84 | pylint==2.4.4
 85 | Pympler==0.8
 86 | pyparsing==2.4.5
 87 | PyQt5==5.12.3
 88 | PyQt5-sip==12.7.0
 89 | PyQtWebEngine==5.12.1
 90 | pyrsistent==0.15.6
 91 | pytest==5.3.1
 92 | python-dateutil==2.8.1
 93 | python-jsonrpc-server==0.3.2
 94 | python-language-server==0.31.2
 95 | pytz==2019.3
 96 | pyxdg==0.26
 97 | PyYAML==5.2
 98 | pyzmq==18.1.1
 99 | QDarkStyle==2.7
100 | QtAwesome==0.6.0
101 | qtconsole==4.6.0
102 | QtPy==1.9.0
103 | requests==2.22.0
104 | rope==0.14.0
105 | s3transfer==0.2.1
106 | scikit-learn==0.22
107 | scipy==1.3.3
108 | seaborn==0.9.0
109 | SecretStorage==3.1.1
110 | simplegeneric==0.8.1
111 | six==1.13.0
112 | sklearn==0.0
113 | smart-open==1.9.0
114 | snowballstemmer==2.0.0
115 | sortedcontainers==2.1.0
116 | spacy==2.2.3
117 | Sphinx==2.2.2
118 | sphinxcontrib-applehelp==1.0.1
119 | sphinxcontrib-devhelp==1.0.1
120 | sphinxcontrib-htmlhelp==1.0.2
121 | sphinxcontrib-jsmath==1.0.1
122 | sphinxcontrib-qthelp==1.0.2
123 | sphinxcontrib-serializinghtml==1.1.3
124 | spyder==4.0.0
125 | spyder-kernels==1.8.1
126 | srsly==0.2.0
127 | testpath==0.4.4
128 | thinc==7.3.1
129 | tornado==6.0.3
130 | tqdm==4.40.1
131 | traitlets==4.3.3
132 | typed-ast==1.4.0
133 | ujson==1.35
134 | urllib3==1.25.7
135 | wasabi==0.4.2
136 | watchdog==0.9.0
137 | wcwidth==0.1.7
138 | webencodings==0.5.1
139 | wordcloud==1.6.0
140 | wrapt==1.11.2
141 | wurlitzer==2.0.0
142 | yapf==0.29.0
143 | zipp==0.6.0
144 | alabaster==0.7.12
145 | argh==0.26.2
146 | astroid==2.3.3
147 | astropy==3.2.3
148 | atomicwrites==1.3.0
149 | attrs==19.3.0
150 | autopep8==1.4.4
151 | Babel==2.7.0
152 | backcall==0.1.0
153 | bleach==3.1.4
154 | blis==0.4.1
155 | boto==2.49.0
156 | boto3==1.10.34
157 | botocore==1.13.34
158 | catalogue==0.0.8
159 | certifi==2019.11.28
160 | cffi==1.13.2
161 | chardet==3.0.4
162 | Click==7.0
163 | cloudpickle==1.2.2
164 | confuse==1.0.0
165 | cryptography==2.8
166 | cycler==0.10.0
167 | cymem==2.0.3
168 | dash==1.7.0
169 | dash-core-components==1.6.0
170 | dash-html-components==1.0.2
171 | dash-renderer==1.2.2
172 | dash-table==4.5.1
173 | decorator==4.4.1
174 | defusedxml==0.6.0
175 | diff-match-patch==20181111
176 | docutils==0.15.2
177 | entrypoints==0.3
178 | flake8==3.7.9
179 | Flask==1.1.1
180 | Flask-Compress==1.4.0
181 | funcy==1.14
182 | future==0.18.2
183 | gensim==3.8.1
184 | helpdev==0.6.10
185 | htmlmin==0.1.12
186 | idna==2.8
187 | imagesize==1.1.0
188 | importlib-metadata==1.2.0
189 | intervaltree==3.0.2
190 | ipykernel==5.1.3
191 | ipython==7.0.0
192 | ipython-genutils==0.2.0
193 | ipywidgets==7.5.1
194 | isort==4.3.21
195 | itsdangerous==1.1.0
196 | jedi==0.14.1
197 | jeepney==0.4.1
198 | Jinja2==2.10.3
199 | jmespath==0.9.4
200 | joblib==0.14.0
201 | json5==0.8.5
202 | jsonschema==3.2.0
203 | jupyter==1.0.0
204 | jupyter-client==5.3.4
205 | jupyter-console==6.0.0
206 | jupyter-core==4.6.1
207 | jupyterlab==1.2.4
208 | jupyterlab-server==1.0.6
209 | keyring==20.0.0
210 | kiwisolver==1.1.0
211 | lab==5.1
212 | lazy-object-proxy==1.4.3
213 | llvmlite==0.30.0
214 | MarkupSafe==1.1.1
215 | matplotlib==3.1.2
216 | mccabe==0.6.1
217 | missingno==0.4.2
218 | mistune==0.8.4
219 | more-itertools==8.0.2
220 | murmurhash==1.0.2
221 | nbconvert==5.6.1
222 | nbformat==4.4.0
223 | nltk==3.4.5
224 | notebook==6.0.2
225 | numba==0.46.0
226 | numexpr==2.7.0
227 | numpy==1.17.4
228 | numpydoc==0.9.1
229 | packaging==19.2
230 | pandas==0.25.3
231 | pandas-profiling==2.3.0
232 | pandocfilters==1.4.2
233 | parso==0.5.1
234 | pathtools==0.1.2
235 | pexpect==4.7.0
236 | phik==0.9.8
237 | pickleshare==0.7.5
238 | Pillow==6.2.1
239 | plac==1.1.3
240 | plotly==4.4.1
241 | pluggy==0.13.1
242 | preshed==3.0.2
243 | prometheus-client==0.7.1
244 | prompt-toolkit==2.0.10
245 | psutil==5.6.7
246 | ptyprocess==0.6.0
247 | py==1.8.0
248 | pycodestyle==2.5.0
249 | pycparser==2.19
250 | pydocstyle==4.0.1
251 | pyflakes==2.1.1
252 | Pygments==2.5.2
253 | pyLDAvis==2.1.2
254 | pylint==2.4.4
255 | Pympler==0.8
256 | pyparsing==2.4.5
257 | PyQt5==5.12.3
258 | PyQt5-sip==12.7.0
259 | PyQtWebEngine==5.12.1
260 | pyrsistent==0.15.6
261 | pytest==5.3.1
262 | pytest-pylint==0.14.1
263 | python-dateutil==2.8.1
264 | python-jsonrpc-server==0.3.2
265 | python-language-server==0.31.2
266 | pytz==2019.3
267 | pyxdg==0.26
268 | PyYAML==5.2
269 | pyzmq==18.1.1
270 | QDarkStyle==2.7
271 | QtAwesome==0.6.0
272 | qtconsole==4.6.0
273 | QtPy==1.9.0
274 | requests==2.22.0
275 | retrying==1.3.3
276 | rope==0.14.0
277 | s3transfer==0.2.1
278 | scikit-learn==0.22
279 | scipy==1.3.3
280 | seaborn==0.9.0
281 | SecretStorage==3.1.1
282 | Send2Trash==1.5.0
283 | simplegeneric==0.8.1
284 | simplejson==3.17.0
285 | six==1.13.0
286 | sklearn==0.0
287 | smart-open==1.9.0
288 | snowballstemmer==2.0.0
289 | sortedcontainers==2.1.0
290 | spacy==2.2.3
291 | Sphinx==2.2.2
292 | sphinxcontrib-applehelp==1.0.1
293 | sphinxcontrib-devhelp==1.0.1
294 | sphinxcontrib-htmlhelp==1.0.2
295 | sphinxcontrib-jsmath==1.0.1
296 | sphinxcontrib-qthelp==1.0.2
297 | sphinxcontrib-serializinghtml==1.1.3
298 | spyder==4.0.0
299 | spyder-kernels==1.8.1
300 | srsly==0.2.0
301 | terminado==0.8.3
302 | testpath==0.4.4
303 | thinc==7.3.1
304 | tornado==6.0.3
305 | tqdm==4.40.1
306 | traitlets==4.3.3
307 | typed-ast==1.4.0
308 | ujson==1.35
309 | urllib3==1.25.7
310 | wasabi==0.4.2
311 | watchdog==0.9.0
312 | wcwidth==0.1.7
313 | webencodings==0.5.1
314 | Werkzeug==0.16.0
315 | widgetsnbextension==3.5.1
316 | wordcloud==1.6.0
317 | wrapt==1.11.2
318 | wurlitzer==2.0.0
319 | yapf==0.29.0
320 | zipp==0.6.0
321 | alabaster==0.7.12
322 | argh==0.26.2
323 | astroid==2.3.3
324 | astropy==3.2.3
325 | atomicwrites==1.3.0
326 | attrs==19.3.0
327 | autopep8==1.4.4
328 | Babel==2.7.0
329 | backcall==0.1.0
330 | bleach==3.1.4
331 | blis==0.4.1
332 | boto==2.49.0
333 | boto3==1.10.34
334 | botocore==1.13.34
335 | catalogue==0.0.8
336 | certifi==2019.11.28
337 | cffi==1.13.2
338 | chardet==3.0.4
339 | Click==7.0
340 | cloudpickle==1.2.2
341 | confuse==1.0.0
342 | cryptography==2.8
343 | cycler==0.10.0
344 | cymem==2.0.3
345 | dash==1.7.0
346 | dash-core-components==1.6.0
347 | dash-html-components==1.0.2
348 | dash-renderer==1.2.2
349 | dash-table==4.5.1
350 | decorator==4.4.1
351 | defusedxml==0.6.0
352 | diff-match-patch==20181111
353 | docutils==0.15.2
354 | entrypoints==0.3
355 | flake8==3.7.9
356 | Flask==1.1.1
357 | Flask-Compress==1.4.0
358 | Flask-Cors==3.0.8
359 | funcy==1.14
360 | future==0.18.2
361 | gensim==3.8.1
362 | helpdev==0.6.10
363 | htmlmin==0.1.12
364 | idna==2.8
365 | imagesize==1.1.0
366 | importlib-metadata==1.2.0
367 | intervaltree==3.0.2
368 | ipykernel==5.1.3
369 | ipython==7.0.0
370 | ipython-genutils==0.2.0
371 | ipywidgets==7.5.1
372 | isort==4.3.21
373 | itsdangerous==1.1.0
374 | jedi==0.14.1
375 | jeepney==0.4.1
376 | Jinja2==2.10.3
377 | jmespath==0.9.4
378 | joblib==0.14.0
379 | json5==0.8.5
380 | jsonschema==3.2.0
381 | jupyter==1.0.0
382 | jupyter-client==5.3.4
383 | jupyter-console==6.0.0
384 | jupyter-core==4.6.1
385 | jupyterlab==1.2.4
386 | jupyterlab-server==1.0.6
387 | keyring==20.0.0
388 | kiwisolver==1.1.0
389 | lab==5.1
390 | lazy-object-proxy==1.4.3
391 | llvmlite==0.30.0
392 | MarkupSafe==1.1.1
393 | matplotlib==3.1.2
394 | mccabe==0.6.1
395 | missingno==0.4.2
396 | mistune==0.8.4
397 | more-itertools==8.0.2
398 | murmurhash==1.0.2
399 | nbconvert==5.6.1
400 | nbformat==4.4.0
401 | nltk==3.4.5
402 | notebook==6.0.2
403 | numba==0.46.0
404 | numexpr==2.7.0
405 | numpy==1.17.4
406 | numpydoc==0.9.1
407 | packaging==19.2
408 | pandas==0.25.3
409 | pandas-profiling==2.3.0
410 | pandocfilters==1.4.2
411 | parso==0.5.1
412 | pathtools==0.1.2
413 | pexpect==4.7.0
414 | phik==0.9.8
415 | pickleshare==0.7.5
416 | Pillow==6.2.1
417 | plac==1.1.3
418 | plotly==4.4.1
419 | pluggy==0.13.1
420 | preshed==3.0.2
421 | prometheus-client==0.7.1
422 | prompt-toolkit==2.0.10
423 | psutil==5.6.7
424 | ptyprocess==0.6.0
425 | py==1.8.0
426 | pycodestyle==2.5.0
427 | pycparser==2.19
428 | pydocstyle==4.0.1
429 | pyflakes==2.1.1
430 | Pygments==2.5.2
431 | pyLDAvis==2.1.2
432 | pylint==2.4.4
433 | Pympler==0.8
434 | pyparsing==2.4.5
435 | PyQt5==5.12.3
436 | PyQt5-sip==12.7.0
437 | PyQtWebEngine==5.12.1
438 | pyrsistent==0.15.6
439 | pytest==5.3.1
440 | pytest-pylint==0.14.1
441 | python-dateutil==2.8.1
442 | python-jsonrpc-server==0.3.2
443 | python-language-server==0.31.2
444 | pytz==2019.3
445 | pyxdg==0.26
446 | PyYAML==5.2
447 | pyzmq==18.1.1
448 | QDarkStyle==2.7
449 | QtAwesome==0.6.0
450 | qtconsole==4.6.0
451 | QtPy==1.9.0
452 | requests==2.22.0
453 | retrying==1.3.3
454 | rope==0.14.0
455 | s3transfer==0.2.1
456 | scikit-learn==0.22
457 | scipy==1.3.3
458 | seaborn==0.9.0
459 | SecretStorage==3.1.1
460 | Send2Trash==1.5.0
461 | simplegeneric==0.8.1
462 | simplejson==3.17.0
463 | six==1.13.0
464 | sklearn==0.0
465 | smart-open==1.9.0
466 | snowballstemmer==2.0.0
467 | sortedcontainers==2.1.0
468 | spacy==2.2.3
469 | Sphinx==2.2.2
470 | sphinxcontrib-applehelp==1.0.1
471 | sphinxcontrib-devhelp==1.0.1
472 | sphinxcontrib-htmlhelp==1.0.2
473 | sphinxcontrib-jsmath==1.0.1
474 | sphinxcontrib-qthelp==1.0.2
475 | sphinxcontrib-serializinghtml==1.1.3
476 | spyder==4.0.0
477 | spyder-kernels==1.8.1
478 | srsly==0.2.0
479 | terminado==0.8.3
480 | testpath==0.4.4
481 | thinc==7.3.1
482 | tornado==6.0.3
483 | tqdm==4.40.1
484 | traitlets==4.3.3
485 | typed-ast==1.4.0
486 | ujson==1.35
487 | urllib3==1.25.7
488 | wasabi==0.4.2
489 | watchdog==0.9.0
490 | wcwidth==0.1.7
491 | webencodings==0.5.1
492 | Werkzeug==0.16.0
493 | widgetsnbextension==3.5.1
494 | wordcloud==1.6.0
495 | wrapt==1.11.2
496 | wurlitzer==2.0.0
497 | yapf==0.29.0
498 | zipp==0.6.0
499 | bs4


--------------------------------------------------------------------------------
/scripts/sentiment_analysis.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | ### Import libraries ###
  4 | 
  5 | import pandas as pd
  6 | import numpy as np
  7 | import matplotlib.pyplot as plt
  8 | from wordcloud import WordCloud
  9 | from sklearn.feature_extraction.text import CountVectorizer
 10 | from sklearn.model_selection import train_test_split, GridSearchCV
 11 | from sklearn import metrics
 12 | from sklearn.linear_model import LogisticRegression
 13 | from sklearn.metrics import classification_report
 14 | from sklearn.naive_bayes import MultinomialNB
 15 | import itertools
 16 | from pathlib import Path
 17 | from data_utils import text_preprocessing
 18 | 
 19 | 
 20 | 
 21 | figOutputPath = Path("../figures/")
 22 | 
 23 | ### Functions ###
 24 | 
 25 | def plot_confusion_matrix(cm, name_img, classes=['negative', 'positive']):
 26 |     fig, ax = plt.subplots(figsize=(10,10))
 27 |     img = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
 28 |     #ax.set_title('Confusion matrix {}'.format(title))
 29 |     ax.axis('off')
 30 |     fig.colorbar(img)
 31 |     tick_marks = np.arange(len(classes))
 32 |     ax.set_xticks(tick_marks, classes)
 33 |     ax.set_yticks(tick_marks, classes)
 34 |     fmt = '.2f'
 35 |     thresh = cm.max() / 2.
 36 |     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
 37 |         ax.text(j, i, format(cm[i, j], fmt),
 38 |                  horizontalalignment="center",
 39 |                  color="white" if cm[i, j] > thresh else "black")
 40 |     
 41 |     fig.tight_layout()
 42 |     ax.set_ylabel('True label')
 43 |     ax.set_xlabel('Predicted label')
 44 |     ax.figure.savefig(figOutputPath / '2_confusion_matrix_{}.svg'.format(name_img),
 45 |                       format='svg')
 46 | 
 47 | 
 48 | def plot_roc(y_true, y_pred, name_img, pos_label=1):
 49 |     fpr, tpr, _ = metrics.roc_curve(y_true, y_pred[:, 1], pos_label)
 50 |     roc_auc = metrics.auc(fpr, tpr)
 51 |     fig, ax = plt.subplots(figsize=(10,10))
 52 |     #ax.set_title('Receiver Operating Characteristic of {}'.format(title))
 53 |     ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
 54 |     ax.legend(loc='lower right')
 55 |     ax.plot([0, 1], [0, 1], 'r--')
 56 |     ax.set_xlim(0, 1)
 57 |     ax.set_ylim(0, 1)
 58 |     ax.set_ylabel('True Positive Rate')
 59 |     ax.set_xlabel('False Positive Rate')
 60 |     ax.figure.savefig(figOutputPath / '2_roc_{}.svg'.format(name_img),
 61 |                       format='svg')
 62 | 
 63 | 
 64 | def wordcloud(text, sentiment, title=None):
 65 |     wordcloud = WordCloud(
 66 |         background_color='whitesmoke',
 67 |         max_words=200,
 68 |         max_font_size=120,
 69 |         scale=3,
 70 |         random_state=42,
 71 |         width=1000,
 72 |         height=1000,
 73 |     ).generate(str(text))
 74 | 
 75 |     fig, ax = plt.subplots(figsize=(10, 10))
 76 |     ax.axis('off')   
 77 |     ax.imshow(wordcloud, interpolation='nearest')
 78 |     ax.figure.savefig(figOutputPath / '2_wordcloud_{}.svg'.format(sentiment),
 79 |                       format='svg')
 80 |     print('Exported 2_wordcloud_{}.svg'.format(sentiment))
 81 |     
 82 |     
 83 | def retrieve_opinion(df, sentiment):
 84 |     opinion = df[df['opinion'] == sentiment]
 85 |     reviews = opinion['preprocessedReview'].tolist()
 86 |     wordcloud(reviews, sentiment)
 87 |     
 88 | 
 89 | def get_term_frequency(df, cvector):
 90 |     cvector.fit(df.preprocessedReview)
 91 |     
 92 |     negative_matrix = cvector.transform(df[df['opinion'] == 'negative']['preprocessedReview'])
 93 |     negative_words = negative_matrix.sum(axis=0)
 94 |     negative_frequency = [(word, negative_words[0, idx]) for word, idx in cvector.vocabulary_.items()]
 95 |     negative_tf = pd.DataFrame(list(sorted(negative_frequency, key = lambda x: x[1], reverse=True)),
 96 |                                columns=['Terms','negative'])
 97 |     negative_tf = negative_tf.set_index('Terms')
 98 |     
 99 |     positive_matrix = cvector.transform(df[df['opinion'] == 'positive']['preprocessedReview'])
100 |     positive_words = positive_matrix.sum(axis=0)
101 |     positive_frequency = [(word, positive_words[0, idx]) for word, idx in cvector.vocabulary_.items()]
102 |     positive_tf = pd.DataFrame(list(sorted(positive_frequency, key = lambda x: x[1], reverse=True)),
103 |                                columns=['Terms','positive'])
104 |     positive_tf = positive_tf.set_index('Terms')
105 |     
106 |     term_frequency_df = pd.concat([negative_tf, positive_tf], axis=1)
107 |     term_frequency_df['total'] = term_frequency_df['negative'] + term_frequency_df['positive']
108 |     return term_frequency_df
109 | 
110 | 
def plot_frequency(df, top_n=500):
    """Plot the most frequent tokens against the curve expected by Zipf's law.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-frequency table with a ``total`` column.
    top_n : int
        Maximum number of tokens to draw (default 500, the previously
        hard-coded value). Fewer bars are drawn when ``df`` has fewer rows,
        instead of failing on a length mismatch.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    # Sort once and slice positionally; the index holds the tokens, so
    # integer lookups must go through .iloc.
    totals = df.sort_values(by='total', ascending=False)['total']
    top = totals.iloc[:top_n]
    if top.empty:
        raise ValueError('plot_frequency needs at least one term')

    y_pos = np.arange(len(top))
    s = 1
    # Zipf reference: the frequency at rank r is the top frequency divided by r**s.
    highest = top.iloc[0]
    expected_zipf = [highest / (i + 1) ** s for i in y_pos]

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.bar(y_pos, top, align='center', alpha=0.5)
    ax.plot(y_pos, expected_zipf, color='r', linestyle='--', linewidth=2, alpha=0.5)
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Token')
    ax.set_yticks([0, 200000, 400000])
    ax.set_yticklabels(["0", "200K", "400K"])
    #ax.set_title('Top 500 tokens in reviews')
    fig.savefig(figOutputPath / '2_plot_frequency.svg', format='svg')
    print('Exported 2_plot_frequency.svg')
126 |     
127 | 
def token_frequency(df, sentiment, top_n=50):
    """Bar chart of the most frequent tokens for one opinion class.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-frequency table indexed by token, with one column per opinion.
    sentiment : str
        Column of ``df`` to rank by; also the suffix of the output file
        ``2_token_frequency_<sentiment>.svg`` inside ``figOutputPath``.
    top_n : int
        Maximum number of tokens to draw (default 50, the previously
        hard-coded value). Fewer bars are drawn when ``df`` has fewer rows.
    """
    # Sort once; positional slice because the index holds the tokens.
    top = df.sort_values(by=sentiment, ascending=False)[sentiment].iloc[:top_n]
    y_pos = np.arange(len(top))

    # Explicit figure/axes, consistent with the other plotting helpers here.
    fig, ax = plt.subplots(figsize=(12, 10))
    ax.bar(y_pos, top, align='center', alpha=0.5)
    ax.set_xticks(y_pos)
    ax.set_xticklabels(top.index, rotation='vertical')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Token')
    #ax.set_title('Top 50 tokens in {} reviews'.format(sentiment))
    fig.savefig(figOutputPath / '2_token_frequency_{}.svg'.format(sentiment), format='svg')
    print('Exported 2_token_frequency_{}.svg'.format(sentiment))
138 | 
139 | 
def zipf_law(df):
    """Log-log rank/frequency plot of the tokens, exported as PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-frequency table indexed by token, with a ``total`` column.

    The plot is written to ``2_zipf_law.png`` inside ``figOutputPath``. A
    sample of tokens, at logarithmically spaced ranks, is annotated on the
    curve.
    """
    # Work on plain NumPy arrays so every integer lookup below is positional
    # (the DataFrame index holds the tokens, not integers).
    counts = df['total'].to_numpy()
    tokens = df.index
    ranks = np.arange(1, len(counts) + 1)
    order = np.argsort(-counts)
    frequencies = counts[order]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_ylim(1, 10**6)
    ax.set_xlim(1, 10**6)
    ax.loglog(ranks, frequencies, marker=".")
    # Red reference line from (1, top frequency) to (top frequency, 1).
    ax.plot([1, frequencies[0]], [frequencies[0], 1], color='r')
    #ax.set_title("Zipf plot for phrases tokens")
    ax.set_xlabel("Frequency rank of token")
    ax.set_ylabel("Absolute frequency of token")
    ax.grid(True)

    # Annotate log-spaced positions. Tables with fewer than three terms are
    # skipped because log10(len - 2) is undefined, and duplicates produced by
    # the integer truncation are dropped so no label is drawn twice.
    if len(counts) > 2:
        positions = np.logspace(-0.5, np.log10(len(counts) - 2), 15).astype(int)
        for n in np.unique(positions):
            ax.text(ranks[n], frequencies[n], " " + tokens[order[n]],
                    verticalalignment="bottom",
                    horizontalalignment="left")

    fig.savefig(figOutputPath / '2_zipf_law.png', format='png')
    print('Exported 2_zipf_law.png')
163 |     
164 | 
def undersampling(df):
    """Balance the positive and negative classes by random undersampling.

    The larger of the two classes is down-sampled (seed 42) to the size of
    the smaller one, then the rows are shuffled (seed 42).

    The class sizes are looked up by label instead of unpacking
    ``value_counts()``, which assumed exactly two classes with ``positive``
    as the majority. The shuffle is now seeded so repeated runs return the
    same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``opinion`` column. Rows whose opinion is neither
        ``'positive'`` nor ``'negative'`` are not part of the result.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with the same number of positive and negative rows.
    """
    df_positive = df[df.opinion == 'positive']
    df_negative = df[df.opinion == 'negative']

    target = min(len(df_positive), len(df_negative))
    if len(df_positive) > target:
        df_positive = df_positive.sample(target, random_state=42)
    if len(df_negative) > target:
        df_negative = df_negative.sample(target, random_state=42)

    balanced = pd.concat([df_positive, df_negative])
    return balanced.sample(frac=1, random_state=42)
173 | 
174 | 
175 |     
def _fit_and_evaluate(estimator, param_grid, train_features, train_labels,
                      validation_features, validation_labels, name_img):
    """Grid-search ``estimator`` with 5-fold CV and report on the validation set.

    Prints the best parameters and the classification report, exports the
    confusion matrix and ROC figures tagged with ``name_img``, and returns
    the fitted ``GridSearchCV`` object.
    """
    grid = GridSearchCV(estimator, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
    best = grid.fit(train_features, train_labels)

    print("Best params")
    for name, value in best.best_params_.items():
        print(name, value)

    y_pred = best.predict(validation_features)
    y_pred_roc = best.predict_proba(validation_features)
    print("Report on validation set")
    print(classification_report(validation_labels, y_pred))
    cm = metrics.confusion_matrix(y_true=validation_labels, y_pred=y_pred, labels=[0, 1])
    plot_confusion_matrix(cm, name_img)
    plot_roc(validation_labels, y_pred_roc, name_img)
    return best


def run(df):
    """Run the exploration plots and train the two sentiment classifiers.

    Steps: drop neutral reviews, filter reviews by length, export the word
    clouds and frequency plots, balance the classes, build a bag of words
    (10000 features, unigrams and bigrams) and grid-search a Logistic
    Regression and a Multinomial Naive Bayes model.

    Side effects on the caller's ``df``: neutral rows are dropped in place
    and a ``words`` column is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``opinion`` and ``preprocessedReview``.

    Returns
    -------
    tuple
        ``(best_nb, best_lr, count_vector_sentiment)``: the two fitted
        ``GridSearchCV`` objects and the fitted ``CountVectorizer``.
    """
    df.drop(df[df.opinion == 'neutral'].index, inplace=True)
    count_vector_exploration = CountVectorizer(max_features=10000, ngram_range=(1, 2))

    # Length filter in words. The reviews are fed directly to CountVectorizer,
    # i.e. they are strings, so a plain len() would count characters rather
    # than words. Token lists, if ever passed, are counted as they are.
    df['words'] = [len(t.split()) if isinstance(t, str) else len(t)
                   for t in df['preprocessedReview']]
    df = df[(df['words'] <= 300) & (df['words'] > 5)]

    retrieve_opinion(df, 'positive')
    retrieve_opinion(df, 'negative')
    term_frequency = get_term_frequency(df, count_vector_exploration)
    zipf_law(term_frequency)
    plot_frequency(term_frequency)
    token_frequency(term_frequency, 'positive')
    token_frequency(term_frequency, 'negative')

    ### Machine learning ###
    df = undersampling(df)
    count_vector_sentiment = CountVectorizer(max_features=10000, ngram_range=(1, 2))
    reviews = np.array(df['preprocessedReview'])
    # Binary target: 1 = positive, 0 = negative.
    sentiments = np.array(df['opinion'])
    sentiments[sentiments == 'positive'] = 1
    sentiments[sentiments == 'negative'] = 0
    sentiments = sentiments.astype('int')

    # One split and one vectoriser fit serve both models. The original
    # repeated both with the same seed, which yields identical data.
    reviews_train, reviews_validation, sentiment_train, sentiment_validation = train_test_split(
        reviews, sentiments, test_size=0.2, random_state=42)
    count_vector_features = count_vector_sentiment.fit_transform(reviews_train)
    count_vector_validation_features = count_vector_sentiment.transform(reviews_validation)

    # Logistic Regression: CV with grid search on the bag of words
    best_lr = _fit_and_evaluate(
        LogisticRegression(max_iter=10000, random_state=42),
        [{'C': np.logspace(0, 4, 4)}],
        count_vector_features, sentiment_train,
        count_vector_validation_features, sentiment_validation,
        'lr')

    # Multinomial Naive Bayes: CV with grid search on the bag of words
    best_nb = _fit_and_evaluate(
        MultinomialNB(),
        [{'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)}],
        count_vector_features, sentiment_train,
        count_vector_validation_features, sentiment_validation,
        'nb')

    return best_nb, best_lr, count_vector_sentiment
254 | 
255 | 
256 |     
257 | 
def compute_single(string, model, count_vector_sentiment):
    """Return ``model.predict_proba`` for a single raw review.

    The text goes through ``text_preprocessing`` (with
    ``remove_less_frequent=False``); the first preprocessed item is joined
    with spaces into one document, vectorised with ``count_vector_sentiment``
    and scored by ``model``.
    """
    preprocessed = text_preprocessing([string], remove_less_frequent=False)
    document = " ".join(preprocessed[0])
    features = count_vector_sentiment.transform(np.asarray([document]))
    return model.predict_proba(features)


--------------------------------------------------------------------------------
/report/3_sentiment_analysis.tex:
--------------------------------------------------------------------------------
  1 | \chapter{Sentiment analysis}
  2 | \label{sentiment_analysis}
  3 | Oggigiorno siamo affetti da e produciamo un tale sovraccarico di dati che le aziende si stanno ridefinendo per raccogliere queste informazioni, come per esempio i feedback dei clienti, e strutturare il processo decisionale. L'ottenimento di questi dati è impensabile se fatto manualmente.
  4 | \par 
  5 | In particolare, per le opinioni su prodotti e servizi viene in aiuto la sentiment analysis, una disciplina che può fornire risposte riguardo le questioni più importanti dal punto di vista dei clienti.
  6 | \par
  7 | Il processo di sentiment analysis permette, attraverso l'elaborazione del linguaggio naturale, di estrarre e analizzare in modo automatizzato opinioni soggettive espresse dall'utente, determinarne la polarità (positiva, neutrale, negativa) e, successivamente, riassumerle in maniera da poter essere di valore per l'azienda.
  8 | \par 
  9 | In questo modo, le decisioni possono essere prese sulla base di una quantità di dati significativa, piuttosto che da una semplice intuizione che non sempre si rivela corretta. Il rischio infatti a cui si va incontro maggiormente è quello di interpretare i messaggi avendo già un pregiudizio sull’argomento in questione, influenzando il modo in cui il testo da analizzare può essere interpretato.
 10 | \par
 11 | La sentiment analysis è importante perché le aziende vogliono che il loro marchio sia recepito positivamente (con un occhio alle aziende concorrenti). A tal proposito, ci si può concentrare su commenti positivi o negativi oltre che sul feedback del cliente, per valutare sia i punti di forza che i punti su cui migliorare.
 12 | 
 13 | 
 14 | \section{Preprocessing}
 15 | \label{preprocessing}
 16 | Prima di partire con lo svolgimento del task di sentiment analysis, è necessaria una fase di preprocessing.
 17 | \par
 18 | Innanzitutto, sono stati rimossi dal dataset i campi ritenuti superflui per l'analisi.
 19 | \par
 20 | Successivamente, è stato manipolato il campo \texttt{reviewText}. La manipolazione è avvenuta sequenzialmente e con step standard per analisi di questo tipo:
 21 | 
 22 | \begin{itemize}
 23 |     \item \textbf{Normalization} - conversione delle recensioni in caratteri minuscoli. Se presenti, modificate alcune espressioni contratte tipiche della lingua inglese (per esempio: \textit{hadn't} trasformata in \textit{had not}).
 24 |     \item \textbf{Tokenization} - suddivisione in token per ogni recensione
 25 |     \item \textbf{Removal} - rimozione di token altamente ricorrenti nella lingua considerata (stopwords). Inoltre, sono stati eliminati token composti da 1 o 2 caratteri o token estremamente rari (frequenza nel dataset = 1)
 26 |     \item \textbf{Lemmatization} - conversione del token nel proprio lemma linguistico
 27 |     
 28 | \end{itemize}{}
 29 | 
 30 | Questa manipolazione ha portato alla creazione del campo \texttt{preprocessedReview}.
 31 | Di seguito vengono mostrate la recensione originale e la recensione dopo l'intera fase di preprocessing.
 32 | \par
 33 | \begin{itemize}
 34 |   \item[\textbf{Prima}] \texttt{Overall a great product with a fair price. I have had absolutely no problems with the product except for the volume level, which is *NOT* below standard, it is just simply what is to be expected from a headset. Very comfortable, and I personally prefer the boom mic to be longer (unlike the newer models of this headset which have shortened mics). Recommended.}
 35 |   \item[\textbf{Dopo}] \texttt{overall great product fair price absolutely problem product volume level standard simply expect headset comfortable personally prefer boom mic longer new model shorten mic recommend}
 36 | \end{itemize}
 37 | 
 38 | \section{Creazione di Bag of Words}
 39 | \label{bow}
 40 | Il campo \texttt{preprocessedReview} non è direttamente trattabile dagli algoritmi di machine learning e quindi è necessario ottenere una rappresentazione comprensibile. 
 41 | Innanzitutto, abbiamo rimosso dall'analisi del campo \texttt{preprocessedReview} le recensioni:
 42 | \begin{itemize}
 43 |     \item prolisse - rientrano in questa categoria le osservazioni con più di 300 parole
 44 |     \item irrilevanti - rientrano in questa categoria le osservazioni con meno di 5 parole
 45 | \end{itemize}{}
 46 | 
 47 | Dopodiché, le recensioni restanti sono state utilizzate per costruire una Bag of Words composta da 10000 feature. Oltre ai token vengono considerati anche i bigrammi, poiché un loro utilizzo può aumentare l'accuratezza del modello rispetto al solo utilizzo di token singoli.
 48 | \section{Esplorazione}
 49 | A partire dalla nuova rappresentazione matriciale è stato creato un DataFrame fittizio composto dalle 10000 feature individuate nel Capitolo \ref{bow} e, per ognuna di esse, viene segnata la frequenza con cui appaiono rispettivamente nelle recensioni positive e nelle recensioni negative, oltre che la frequenza totale data dalla loro somma.
 50 | \par
 51 | Le recensioni neutrali non sono state considerate in quanto aggiungerebbero un livello ulteriore di complessità nell'apprendimento dei modelli.
 52 | 
 53 | \subsection{Wordcloud}
 54 | Una Wordcloud è una rappresentazione grafica delle parole usate di frequente in un corpus di documenti e fornisce un'idea generale di che tipologia di parole possiamo trovarvi. La grandezza di ogni parola nell'immagine è un'indicazione della frequenza di occorrenza della parola nell'intero corpus. Per questo motivo, sono molto utili quando si vuole eseguire un'analisi del testo.
 55 | 
 56 | \begin{figure}[H]
 57 |   \centering
 58 |   \includesvg[width=0.9\linewidth]{figures/2_wordcloud_positive}
 59 |   \caption{Wordcloud of positive reviews}
 60 |   \label{pos_wordcloud}
 61 | \end{figure}
 62 | 
 63 | \begin{figure}[H]
 64 |   \centering
 65 |   \includesvg[width=0.9\linewidth]{figures/2_wordcloud_negative}
 66 |   \caption{Wordcloud of negative reviews}
 67 |   \label{neg_wordcloud}
 68 | \end{figure}
 69 | 
 70 | \subsection{Frequenza dei token}
 71 | 
 72 | Un'ulteriore analisi che si può fare riguarda la distribuzione dei token all'interno del dataset. Nella Figura \ref{50_pos} e nella Figura \ref{50_neg} vengono mostrati gli istogrammi (ordinati dal token più frequente al token meno frequente) rispettivamente per le recensioni positive e le recensioni negative.
 73 | \begin{figure}[H]
 74 |   \centering
 75 |   \includesvg[width=0.8\linewidth]{figures/2_token_frequency_positive}
 76 |   \caption{Top 50 tokens in positive reviews}
 77 |   \label{50_pos}
 78 | \end{figure}
 79 | 
 80 | \begin{figure}[H]
 81 |   \centering
 82 |   \includesvg[width=0.8\linewidth]{figures/2_token_frequency_negative}
 83 |   \caption{Top 50 tokens in negative reviews}
 84 |   \label{50_neg}
 85 | \end{figure}
 86 | 
 87 | Un modello comunemente utilizzato è la legge di Zipf, ovvero una legge empirica formulata nel 1949 in cui si afferma che, dato un corpus di documenti, la frequenza di ogni parola è inversamente proporzionale al suo rango nella tabella delle frequenze. Pertanto, la parola più frequente ricorre approssimativamente il doppio delle volte rispetto alla seconda parola più frequente, il triplo rispetto alla terza parola più frequente e così via.
 88 | 
 89 | \begin{figure}[H]
 90 |   \centering
 91 |   \includesvg[width=0.8\linewidth]{figures/2_plot_frequency}
 92 |   \caption{Distribution of words in review for each opinion}
 93 |   \label{distribution_words_opinion}
 94 | \end{figure}
 95 | 
 96 | La legge di Zipf viene osservata più facilmente tracciando i dati su scala logaritmica in entrambi gli assi come mostrato in Figura \ref{zipf_law}.
 97 | 
 98 | \begin{figure}[H]
 99 |   \centering
100 |   \captionsetup{margin=1cm}
101 |   \includegraphics[width=0.8\linewidth]{figures/2_zipf_law.png}
102 |   \caption{Distribution of words in verified reviews}
103 |   \label{zipf_law}
104 | \end{figure}
105 | 
106 | 
107 | \section{Machine learning}
108 | La tecnica con cui abbiamo affrontato la fase di sentiment analysis è il machine learning. Grazie ad essa il modello viene addestrato per riconoscere il sentimento in base alle parole usando un training set etichettato. Questo approccio dipende in gran parte dal tipo di algoritmo e dalla qualità dei dati utilizzati per l'addestramento.
109 | 
110 | \par
111 | Grazie all'attenta fase di preprocessing, è stato possibile addestrare due modelli diversi con gli stessi dati e valutarne i risultati, confrontandoli.
112 | \par
113 | Per prima cosa abbiamo dovuto risolvere il problema dello sbilanciamento tra classi: le recensioni positive sono in numero molto maggiore rispetto a quelle negative e qualsiasi modello addestrato rischierebbe di ottimizzarsi più sulla classe maggiore.
114 | La soluzione più adeguata in questo caso è l'utilizzo di una tecnica di undersampling in modo da ridurre gli elementi della classe maggioritaria senza introdurre bias e rendendo equiparabili le cardinalità delle classi.
115 | \par
116 | Il problema di machine learning è di natura binaria in quanto la variabile target ha solo due possibili valori: \textit{positive} e \textit{negative}.
117 | Data questa semplicità ci sono numerosi modelli che possono essere addestrati. Sono stati scelti Multinomial Naive Bayes e Logistic Regression per il loro ottimo compromesso tra performance e tempo di addestramento. Abbiamo provato ad implementare anche una Support Vector Machine ma le risorse hardware richieste per l'addestramento non erano disponibili.
118 | \par
119 | Per entrambi i modelli è stata eseguita la tecnica di Cross Validation (CV) su 5 folds, dividendo quindi il training set in 5 sottoinsiemi e usandone uno come test set, per 5 volte.
120 | 
121 | % TODO: split da aggiungere
122 | Per entrambi i modelli è stato tenuto da parte un validation set per la verifica finale e l'analisi delle varie metriche.
123 | 
124 | \subsection{Analisi dei risultati}
125 | 
126 | Al di sopra della CV è stata eseguita anche una Grid Search nel tentativo di trovare i migliori iperparametri per i due modelli.
127 | In Naive Bayes è stato trovato il migliore valore per alpha (0.1), mentre nella Logistic Regression il migliore valore dell'iperparametro C è stato 1.
128 | \par
129 | Analizzando le matrici di confusione dei due modelli in Figura \ref{cm_nb} e in Figura \ref{cm_lr} notiamo che il modello di Logistic Regression individua in totale meno falsi positivi e falsi negativi rispetto a Naive Bayes; ciò è confermato dal valore di \textit{Accuracy} leggermente più alto.
130 | 
131 | \begin{figure}[H]
132 |   \centering
133 |   \includesvg[width=0.9\linewidth]{figures/2_confusion_matrix_nb.svg}
134 |   \caption{Confusion Matrix per Naive Bayes}
135 |   \label{cm_nb}
136 | \end{figure}
137 | 
138 | \begin{figure}[H]
139 |   \centering
140 |   \includesvg[width=0.9\linewidth]{figures/2_confusion_matrix_lr.svg}
141 |   \caption{Confusion Matrix per Logistic Regression}
142 |   \label{cm_lr}
143 | \end{figure}
144 | 
145 | \par
146 | Sia i valori di \textit{Precision} che quelli di \textit{Recall} di Logistic Regression sono più alti indicando che per entrambe le classi tale modello trova un miglior numero di True e un minor numero di False.
147 | 
148 | \begin{table}[H]
149 | \centering
150 |   \begin{tabular}{l l} 
151 |   Accuracy complessiva & 0.84\\
152 |   Precision per la classe \textit{positive} & 0.84\\
153 |   Precision per la classe \textit{negative} & 0.85\\
154 |   Recall per la classe \textit{positive} & 0.85\\
155 |   Recall per la classe \textit{negative} & 0.84\\
156 |     \end{tabular}
157 |     \caption{Metriche risultanti dall'esecuzione della cross validation su Naive Bayes}
158 | \end{table}
159 | 
160 | \begin{table}[H]
161 | \centering
162 |   \begin{tabular}{l l} 
163 |   Accuracy complessiva & 0.87\\
164 |   Precision per la classe \textit{positive} & 0.87\\
165 |   Precision per la classe \textit{negative} & 0.86\\
166 |   Recall per la classe \textit{positive} & 0.86\\
167 |   Recall per la classe \textit{negative} & 0.88\\
168 |     \end{tabular}
169 |     \caption{Metriche risultanti dall'esecuzione della cross validation su Logistic Regression}
170 | \end{table}
171 | 
172 | 
173 | \par
174 | Le due curve ROC in Figura \ref{roc_nb} e in Figura \ref{roc_lr} mostrano l'andamento del rapporto tra True Positive Rate e False Positive Rate al variare del valore di cut-off di ogni modello: anche in questo grafico possiamo vedere una performance migliore da parte della Logistic Regression poiché la sua curva si avvicina di più a quella ``ideale'' e la sua inclinazione è più verticale.
175 | 
176 | \begin{figure}[H]
177 |   \centering
178 |   \includesvg[width=0.9\linewidth]{figures/2_roc_nb.svg}
179 |   \caption{ROC per Naive Bayes}
180 |   \label{roc_nb}
181 | \end{figure}
182 | 
183 | \begin{figure}[H]
184 |   \centering
185 |   \includesvg[width=0.9\linewidth]{figures/2_roc_lr.svg}
186 |   \caption{ROC per Logistic Regression}
187 |   \label{roc_lr}
188 | \end{figure}
189 | 
190 | 


--------------------------------------------------------------------------------
/scripts/ver_counts.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "count": [
  3 |     [
  4 |       {
  5 |         "data": [
  6 |           47,
  7 |           5
  8 |         ],
  9 |         "name": 1
 10 |       },
 11 |       {
 12 |         "data": [
 13 |           32,
 14 |           2
 15 |         ],
 16 |         "name": 2
 17 |       },
 18 |       {
 19 |         "data": [
 20 |           79,
 21 |           5
 22 |         ],
 23 |         "name": 3
 24 |       },
 25 |       {
 26 |         "data": [
 27 |           231,
 28 |           9
 29 |         ],
 30 |         "name": 4
 31 |       },
 32 |       {
 33 |         "data": [
 34 |           1189,
 35 |           35
 36 |         ],
 37 |         "name": 5
 38 |       }
 39 |     ],
 40 |     [
 41 |       {
 42 |         "data": [
 43 |           64,
 44 |           8
 45 |         ],
 46 |         "name": 1
 47 |       },
 48 |       {
 49 |         "data": [
 50 |           72,
 51 |           9
 52 |         ],
 53 |         "name": 2
 54 |       },
 55 |       {
 56 |         "data": [
 57 |           90,
 58 |           17
 59 |         ],
 60 |         "name": 3
 61 |       },
 62 |       {
 63 |         "data": [
 64 |           155,
 65 |           28
 66 |         ],
 67 |         "name": 4
 68 |       },
 69 |       {
 70 |         "data": [
 71 |           580,
 72 |           41
 73 |         ],
 74 |         "name": 5
 75 |       }
 76 |     ],
 77 |     [
 78 |       {
 79 |         "data": [
 80 |           22,
 81 |           3
 82 |         ],
 83 |         "name": 1
 84 |       },
 85 |       {
 86 |         "data": [
 87 |           14,
 88 |           1
 89 |         ],
 90 |         "name": 2
 91 |       },
 92 |       {
 93 |         "data": [
 94 |           37,
 95 |           4
 96 |         ],
 97 |         "name": 3
 98 |       },
 99 |       {
100 |         "data": [
101 |           141,
102 |           2
103 |         ],
104 |         "name": 4
105 |       },
106 |       {
107 |         "data": [
108 |           774,
109 |           25
110 |         ],
111 |         "name": 5
112 |       }
113 |     ],
114 |     [
115 |       {
116 |         "data": [
117 |           35,
118 |           3
119 |         ],
120 |         "name": 1
121 |       },
122 |       {
123 |         "data": [
124 |           31,
125 |           3
126 |         ],
127 |         "name": 2
128 |       },
129 |       {
130 |         "data": [
131 |           35,
132 |           1
133 |         ],
134 |         "name": 3
135 |       },
136 |       {
137 |         "data": [
138 |           93,
139 |           3
140 |         ],
141 |         "name": 4
142 |       },
143 |       {
144 |         "data": [
145 |           800,
146 |           24
147 |         ],
148 |         "name": 5
149 |       }
150 |     ],
151 |     [
152 |       {
153 |         "data": [
154 |           11,
155 |           2
156 |         ],
157 |         "name": 1
158 |       },
159 |       {
160 |         "data": [
161 |           14,
162 |           1
163 |         ],
164 |         "name": 2
165 |       },
166 |       {
167 |         "data": [
168 |           38,
169 |           1
170 |         ],
171 |         "name": 3
172 |       },
173 |       {
174 |         "data": [
175 |           139,
176 |           12
177 |         ],
178 |         "name": 4
179 |       },
180 |       {
181 |         "data": [
182 |           773,
183 |           50
184 |         ],
185 |         "name": 5
186 |       }
187 |     ],
188 |     [
189 |       {
190 |         "data": [
191 |           132,
192 |           11
193 |         ],
194 |         "name": 1
195 |       },
196 |       {
197 |         "data": [
198 |           41,
199 |           0
200 |         ],
201 |         "name": 2
202 |       },
203 |       {
204 |         "data": [
205 |           48,
206 |           1
207 |         ],
208 |         "name": 3
209 |       },
210 |       {
211 |         "data": [
212 |           120,
213 |           11
214 |         ],
215 |         "name": 4
216 |       },
217 |       {
218 |         "data": [
219 |           942,
220 |           71
221 |         ],
222 |         "name": 5
223 |       }
224 |     ],
225 |     [
226 |       {
227 |         "data": [
228 |           40,
229 |           2
230 |         ],
231 |         "name": 1
232 |       },
233 |       {
234 |         "data": [
235 |           26,
236 |           0
237 |         ],
238 |         "name": 2
239 |       },
240 |       {
241 |         "data": [
242 |           36,
243 |           0
244 |         ],
245 |         "name": 3
246 |       },
247 |       {
248 |         "data": [
249 |           103,
250 |           2
251 |         ],
252 |         "name": 4
253 |       },
254 |       {
255 |         "data": [
256 |           892,
257 |           18
258 |         ],
259 |         "name": 5
260 |       }
261 |     ],
262 |     [
263 |       {
264 |         "data": [
265 |           40,
266 |           5
267 |         ],
268 |         "name": 1
269 |       },
270 |       {
271 |         "data": [
272 |           21,
273 |           6
274 |         ],
275 |         "name": 2
276 |       },
277 |       {
278 |         "data": [
279 |           41,
280 |           3
281 |         ],
282 |         "name": 3
283 |       },
284 |       {
285 |         "data": [
286 |           120,
287 |           8
288 |         ],
289 |         "name": 4
290 |       },
291 |       {
292 |         "data": [
293 |           954,
294 |           69
295 |         ],
296 |         "name": 5
297 |       }
298 |     ],
299 |     [
300 |       {
301 |         "data": [
302 |           33,
303 |           7
304 |         ],
305 |         "name": 1
306 |       },
307 |       {
308 |         "data": [
309 |           24,
310 |           7
311 |         ],
312 |         "name": 2
313 |       },
314 |       {
315 |         "data": [
316 |           38,
317 |           5
318 |         ],
319 |         "name": 3
320 |       },
321 |       {
322 |         "data": [
323 |           89,
324 |           26
325 |         ],
326 |         "name": 4
327 |       },
328 |       {
329 |         "data": [
330 |           592,
331 |           156
332 |         ],
333 |         "name": 5
334 |       }
335 |     ],
336 |     [
337 |       {
338 |         "data": [
339 |           39,
340 |           2
341 |         ],
342 |         "name": 1
343 |       },
344 |       {
345 |         "data": [
346 |           19,
347 |           3
348 |         ],
349 |         "name": 2
350 |       },
351 |       {
352 |         "data": [
353 |           41,
354 |           2
355 |         ],
356 |         "name": 3
357 |       },
358 |       {
359 |         "data": [
360 |           134,
361 |           10
362 |         ],
363 |         "name": 4
364 |       },
365 |       {
366 |         "data": [
367 |           1006,
368 |           84
369 |         ],
370 |         "name": 5
371 |       }
372 |     ],
373 |     [
374 |       {
375 |         "data": [
376 |           97,
377 |           7
378 |         ],
379 |         "name": 1
380 |       },
381 |       {
382 |         "data": [
383 |           70,
384 |           3
385 |         ],
386 |         "name": 2
387 |       },
388 |       {
389 |         "data": [
390 |           83,
391 |           2
392 |         ],
393 |         "name": 3
394 |       },
395 |       {
396 |         "data": [
397 |           148,
398 |           14
399 |         ],
400 |         "name": 4
401 |       },
402 |       {
403 |         "data": [
404 |           484,
405 |           70
406 |         ],
407 |         "name": 5
408 |       }
409 |     ],
410 |     [
411 |       {
412 |         "data": [
413 |           24,
414 |           3
415 |         ],
416 |         "name": 1
417 |       },
418 |       {
419 |         "data": [
420 |           19,
421 |           0
422 |         ],
423 |         "name": 2
424 |       },
425 |       {
426 |         "data": [
427 |           41,
428 |           1
429 |         ],
430 |         "name": 3
431 |       },
432 |       {
433 |         "data": [
434 |           154,
435 |           13
436 |         ],
437 |         "name": 4
438 |       },
439 |       {
440 |         "data": [
441 |           1109,
442 |           149
443 |         ],
444 |         "name": 5
445 |       }
446 |     ],
447 |     [
448 |       {
449 |         "data": [
450 |           62,
451 |           4
452 |         ],
453 |         "name": 1
454 |       },
455 |       {
456 |         "data": [
457 |           46,
458 |           2
459 |         ],
460 |         "name": 2
461 |       },
462 |       {
463 |         "data": [
464 |           67,
465 |           6
466 |         ],
467 |         "name": 3
468 |       },
469 |       {
470 |         "data": [
471 |           162,
472 |           22
473 |         ],
474 |         "name": 4
475 |       },
476 |       {
477 |         "data": [
478 |           754,
479 |           102
480 |         ],
481 |         "name": 5
482 |       }
483 |     ],
484 |     [
485 |       {
486 |         "data": [
487 |           75,
488 |           8
489 |         ],
490 |         "name": 1
491 |       },
492 |       {
493 |         "data": [
494 |           46,
495 |           5
496 |         ],
497 |         "name": 2
498 |       },
499 |       {
500 |         "data": [
501 |           70,
502 |           3
503 |         ],
504 |         "name": 3
505 |       },
506 |       {
507 |         "data": [
508 |           161,
509 |           8
510 |         ],
511 |         "name": 4
512 |       },
513 |       {
514 |         "data": [
515 |           609,
516 |           23
517 |         ],
518 |         "name": 5
519 |       }
520 |     ],
521 |     [
522 |       {
523 |         "data": [
524 |           152,
525 |           6
526 |         ],
527 |         "name": 1
528 |       },
529 |       {
530 |         "data": [
531 |           81,
532 |           2
533 |         ],
534 |         "name": 2
535 |       },
536 |       {
537 |         "data": [
538 |           121,
539 |           4
540 |         ],
541 |         "name": 3
542 |       },
543 |       {
544 |         "data": [
545 |           137,
546 |           10
547 |         ],
548 |         "name": 4
549 |       },
550 |       {
551 |         "data": [
552 |           467,
553 |           26
554 |         ],
555 |         "name": 5
556 |       }
557 |     ],
558 |     [
559 |       {
560 |         "data": [
561 |           25,
562 |           2
563 |         ],
564 |         "name": 1
565 |       },
566 |       {
567 |         "data": [
568 |           20,
569 |           1
570 |         ],
571 |         "name": 2
572 |       },
573 |       {
574 |         "data": [
575 |           29,
576 |           4
577 |         ],
578 |         "name": 3
579 |       },
580 |       {
581 |         "data": [
582 |           169,
583 |           8
584 |         ],
585 |         "name": 4
586 |       },
587 |       {
588 |         "data": [
589 |           1355,
590 |           69
591 |         ],
592 |         "name": 5
593 |       }
594 |     ],
595 |     [
596 |       {
597 |         "data": [
598 |           14,
599 |           5
600 |         ],
601 |         "name": 1
602 |       },
603 |       {
604 |         "data": [
605 |           11,
606 |           1
607 |         ],
608 |         "name": 2
609 |       },
610 |       {
611 |         "data": [
612 |           31,
613 |           3
614 |         ],
615 |         "name": 3
616 |       },
617 |       {
618 |         "data": [
619 |           120,
620 |           18
621 |         ],
622 |         "name": 4
623 |       },
624 |       {
625 |         "data": [
626 |           1133,
627 |           177
628 |         ],
629 |         "name": 5
630 |       }
631 |     ],
632 |     [
633 |       {
634 |         "data": [
635 |           80,
636 |           4
637 |         ],
638 |         "name": 1
639 |       },
640 |       {
641 |         "data": [
642 |           45,
643 |           1
644 |         ],
645 |         "name": 2
646 |       },
647 |       {
648 |         "data": [
649 |           60,
650 |           0
651 |         ],
652 |         "name": 3
653 |       },
654 |       {
655 |         "data": [
656 |           121,
657 |           5
658 |         ],
659 |         "name": 4
660 |       },
661 |       {
662 |         "data": [
663 |           668,
664 |           17
665 |         ],
666 |         "name": 5
667 |       }
668 |     ],
669 |     [
670 |       {
671 |         "data": [
672 |           23,
673 |           3
674 |         ],
675 |         "name": 1
676 |       },
677 |       {
678 |         "data": [
679 |           18,
680 |           0
681 |         ],
682 |         "name": 2
683 |       },
684 |       {
685 |         "data": [
686 |           41,
687 |           1
688 |         ],
689 |         "name": 3
690 |       },
691 |       {
692 |         "data": [
693 |           156,
694 |           12
695 |         ],
696 |         "name": 4
697 |       },
698 |       {
699 |         "data": [
700 |           1111,
701 |           141
702 |         ],
703 |         "name": 5
704 |       }
705 |     ],
706 |     [
707 |       {
708 |         "data": [
709 |           37,
710 |           5
711 |         ],
712 |         "name": 1
713 |       },
714 |       {
715 |         "data": [
716 |           39,
717 |           3
718 |         ],
719 |         "name": 2
720 |       },
721 |       {
722 |         "data": [
723 |           36,
724 |           0
725 |         ],
726 |         "name": 3
727 |       },
728 |       {
729 |         "data": [
730 |           102,
731 |           4
732 |         ],
733 |         "name": 4
734 |       },
735 |       {
736 |         "data": [
737 |           1216,
738 |           66
739 |         ],
740 |         "name": 5
741 |       }
742 |     ]
743 |   ],
744 |   "products": [
745 |     "B005NF5NTK",
746 |     "B0092KJ9BU",
747 |     "B00AANQLRI",
748 |     "B00BT8L2MW",
749 |     "B00D856NOG",
750 |     "B00G7UY3EG",
751 |     "B00IGISUTG",
752 |     "B00M51DDT2",
753 |     "B00M6QODH2",
754 |     "B00MQSMDYU",
755 |     "B00MXWFUQC",
756 |     "B00P7N0320",
757 |     "B00QN1T6NM",
758 |     "B00UCZGS6S",
759 |     "B00UH3L82Y",
760 |     "B00VH88CJ0",
761 |     "B00X5RV14Y",
762 |     "B014EB532U",
763 |     "B018JW3EOY",
764 |     "B019PV2I3G"
765 |   ]
766 | }


--------------------------------------------------------------------------------
/webapp/public/lda_B00VH88CJ0.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css">
 3 | 
 4 | 
 5 | <div id="ldavis_el224751401455801140008610013083"></div>
 6 | <script type="text/javascript">
 7 | 
 8 | var ldavis_el224751401455801140008610013083_data = {"mdsDat": {"x": [0.2449517101049423, -0.2449517101049423], "y": [0.0, 0.0], "topics": [1, 2], "cluster": [1, 1], "Freq": [58.49543762207031, 41.50456619262695]}, "tinfo": {"Category": ["Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Default", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic1", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2", "Topic2"], "Freq": [1315.0, 502.0, 482.0, 406.0, 665.0, 361.0, 216.0, 216.0, 497.0, 260.0, 156.0, 228.0, 127.0, 384.0, 192.0, 184.0, 178.0, 100.0, 99.0, 98.0, 94.0, 87.0, 142.0, 85.0, 82.0, 82.0, 133.0, 270.0, 77.0, 76.0, 482.8962707519531, 406.78564453125, 361.5224609375, 260.013427734375, 228.13162231445312, 184.69224548339844, 178.8850860595703, 142.96658325195312, 133.19456481933594, 124.31724548339844, 122.28001403808594, 120.56486511230469, 94.5503921508789, 94.1734619140625, 98.1880111694336, 85.1520004272461, 83.36966705322266, 192.599365234375, 75.88340759277344, 73.31732940673828, 
67.80545806884766, 67.34545135498047, 65.8536148071289, 63.51698303222656, 63.31181716918945, 59.93573760986328, 100.30821990966797, 55.50241470336914, 52.94071578979492, 50.542476654052734, 1312.5560302734375, 360.0873718261719, 519.6683959960938, 796.9768676757812, 79.32808685302734, 160.2740020751953, 89.45114135742188, 122.41326904296875, 117.38837432861328, 177.6331329345703, 90.43231964111328, 105.46953582763672, 107.1075439453125, 502.6947937011719, 216.10971069335938, 216.29214477539062, 127.73009490966797, 100.69578552246094, 99.86002349853516, 98.40487670898438, 94.86237335205078, 87.56250762939453, 85.71647644042969, 82.84402465820312, 82.0932388305664, 77.92078399658203, 76.76319885253906, 75.14519500732422, 74.55669403076172, 74.07687377929688, 70.69990539550781, 67.64109802246094, 66.71844482421875, 64.1108627319336, 61.19413375854492, 59.182376861572266, 57.17455291748047, 55.039833068847656, 52.28261947631836, 47.10963821411133, 47.0255012512207, 47.03329849243164, 44.25577163696289, 153.05735778808594, 390.5718994140625, 487.423095703125, 278.63275146484375, 189.92523193359375, 113.68626403808594, 92.44303894042969, 243.69581604003906, 100.61830139160156, 106.87361907958984], "Term": ["charge", "product", "phone", "fast", "work", "device", "quality", "get", "great", "port", "anker", "usb", "nice", "good", "cable", "power", "plug", "look", "price", "seem", "high", "quick", "need", "review", "item", "new", "little", "well", "excellent", "give", "phone", "fast", "device", "port", "usb", "power", "plug", "need", "little", "small", "keep", "light", "even", "really", "battery", "iphone", "enough", "cable", "stay", "amp", "take", "could", "expect", "drive", "cord", "compact", "quickly", "perfect", "speed", "let", "charge", "use", "car", "charger", "fit", "time", "also", "would", "buy", "work", "love", "good", "great", "product", "get", "quality", "nice", "look", "price", "seem", "high", "quick", "review", "item", "new", "excellent", "give", "never", 
"happy", "cheap", "try", "outlet", "tablet", "update", "brand", "contact", "customer_service", "go", "bit", "replace", "star", "send", "hot", "anker", "great", "work", "good", "well", "make", "purchase", "charger", "buy", "car"], "Total": [1315.0, 502.0, 482.0, 406.0, 665.0, 361.0, 216.0, 216.0, 497.0, 260.0, 156.0, 228.0, 127.0, 384.0, 192.0, 184.0, 178.0, 100.0, 99.0, 98.0, 94.0, 87.0, 142.0, 85.0, 82.0, 82.0, 133.0, 270.0, 77.0, 76.0, 482.9064025878906, 406.7957763671875, 361.5325927734375, 260.0235595703125, 228.14173889160156, 184.70236206054688, 178.89520263671875, 142.97669982910156, 133.20468139648438, 124.32736206054688, 122.29013061523438, 120.57498168945312, 94.56050872802734, 94.18357849121094, 98.19927978515625, 85.16211700439453, 83.3797836303711, 192.62403869628906, 75.89352416992188, 73.32744598388672, 67.81563568115234, 67.3555679321289, 65.86373138427734, 63.527099609375, 63.32193374633789, 59.94585418701172, 100.32588958740234, 55.51253128051758, 52.95083236694336, 50.55259323120117, 1315.6785888671875, 415.3809814453125, 626.5419921875, 1040.6727294921875, 84.26249694824219, 208.7955780029297, 103.056640625, 188.68963623046875, 218.00668334960938, 665.0562133789062, 128.79190063476562, 384.102294921875, 497.679443359375, 502.7043762207031, 216.11927795410156, 216.30177307128906, 127.73966217041016, 100.70535278320312, 99.86962127685547, 98.41444396972656, 94.87194061279297, 87.57207489013672, 85.72604370117188, 82.85359191894531, 82.1028060913086, 77.93035125732422, 76.77276611328125, 75.1547622680664, 74.5662612915039, 74.08644104003906, 70.70947265625, 67.65066528320312, 66.72801208496094, 64.12042999267578, 61.203704833984375, 59.19194793701172, 57.18412399291992, 55.04940414428711, 52.29219055175781, 47.11920928955078, 47.035072326660156, 47.042869567871094, 44.265342712402344, 156.2259521484375, 497.679443359375, 665.0562133789062, 384.102294921875, 270.2966613769531, 155.62167358398438, 119.07128143310547, 1040.6727294921875, 
218.00668334960938, 626.5419921875], "loglift": [30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.5361999869346619, 0.5361999869346619, 0.5361999869346619, 0.5361999869346619, 0.5361999869346619, 0.5361999869346619, 0.5361999869346619, 0.5361999869346619, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360999703407288, 0.5360000133514404, 0.5360000133514404, 0.5360000133514404, 0.5360000133514404, 0.5338000059127808, 0.39340001344680786, 0.3492000102996826, 0.2694000005722046, 0.47589999437332153, 0.2718000113964081, 0.3946000039577484, 0.10350000113248825, -0.0828000009059906, -0.7839000225067139, 0.1826000064611435, -0.7562999725341797, -0.9998999834060669, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8792999982833862, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8791999816894531, 0.8589000105857849, 0.6370000243186951, 0.5685999989509583, 0.5583999752998352, 0.5264999866485596, 0.5654000043869019, 0.6262000203132629, -0.5723000168800354, 0.10620000213384628, -0.88919997215271], "logprob": [30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18.0, 17.0, 16.0, 15.0, 
14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, -3.151700019836426, -3.323199987411499, -3.441200017929077, -3.7708001136779785, -3.901599884033203, -4.112800121307373, -4.144700050354004, -4.368899822235107, -4.439700126647949, -4.508600234985352, -4.525199890136719, -4.539299964904785, -4.782400131225586, -4.786300182342529, -4.74459981918335, -4.88700008392334, -4.908199787139893, -4.070899963378906, -5.002299785614014, -5.0366997718811035, -5.114799976348877, -5.121600151062012, -5.144000053405762, -5.180200099945068, -5.1834001541137695, -5.2382001876831055, -4.723199844360352, -5.315100193023682, -5.362299919128418, -5.408699989318848, -2.1517999172210693, -3.4451000690460205, -3.0782999992370605, -2.650700092315674, -4.957900047302246, -4.2546000480651855, -4.837800025939941, -4.524099826812744, -4.565999984741211, -4.151800155639648, -4.826900005340576, -4.673099994659424, -4.657599925994873, -2.768399953842163, -3.612600088119507, -3.6117000579833984, -4.138400077819824, -4.376200199127197, -4.3846001625061035, -4.399199962615967, -4.4359002113342285, -4.515999794006348, -4.537300109863281, -4.571400165557861, -4.58050012588501, -4.632599830627441, -4.647600173950195, -4.668900012969971, -4.676799774169922, -4.683199882507324, -4.729899883270264, -4.774099826812744, -4.787899971008301, -4.827700138092041, -4.874300003051758, -4.907700061798096, -4.942200183868408, -4.980299949645996, -5.031700134277344, -5.135900020599365, -5.137599945068359, -5.137499809265137, -5.198400020599365, -3.9574999809265137, -3.020699977874756, -2.7992000579833984, -3.3584001064300537, -3.7416999340057373, -4.254899978637695, -4.461699962615967, -3.4923999309539795, -4.376999855041504, -4.316699981689453]}, "token.table": {"Topic": [1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 
2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2], "Freq": [0.8636027574539185, 0.13584762811660767, 0.9955344796180725, 0.019202955067157745, 0.979350745677948, 0.9979706406593323, 0.994412362575531, 0.9966716766357422, 0.5366808176040649, 0.4632885456085205, 1.001951813697815, 0.8299523591995239, 0.17077866196632385, 0.997964084148407, 0.002280192216858268, 0.7658507823944092, 0.23446372151374817, 0.9988332390785217, 1.0009032487869263, 0.9967572093009949, 0.9949159026145935, 0.9947210550308228, 0.9967801570892334, 1.0012928247451782, 1.0074440240859985, 0.9954451322555542, 1.0046477317810059, 1.0008937120437622, 1.0020689964294434, 1.0005019903182983, 0.9375463724136353, 0.05933837965130806, 0.9994481205940247, 1.0029598474502563, 0.9991025328636169, 0.2733646631240845, 0.7263690233230591, 0.21499782800674438, 0.7856462597846985, 1.0058168172836304, 1.0013498067855835, 0.9940056204795837, 0.9980963468551636, 1.0017670392990112, 0.9976274967193604, 1.008850336074829, 1.003524899482727, 0.9984633922576904, 1.0029258728027344, 0.6988016963005066, 0.29504960775375366, 0.2698853015899658, 0.732545793056488, 1.000162959098816, 0.9979407787322998, 0.9987478256225586, 1.0020380020141602, 1.0051637887954712, 1.0087811946868896, 1.0001938343048096, 1.0005857944488525, 0.9999094009399414, 1.0016114711761475, 1.0013054609298706, 1.000588059425354, 0.22675493359565735, 0.772646427154541, 0.9986048340797424, 1.004886507987976, 0.9967516660690308, 0.9980508685112, 0.9974700212478638, 1.0031957626342773, 0.9957888126373291, 0.999088704586029, 0.9973669052124023, 1.000928521156311, 0.9992543458938599, 1.0014029741287231, 1.0040761232376099, 1.0027185678482056, 0.7662997245788574, 0.23467929661273956, 1.0041087865829468, 0.9981217980384827, 0.9993787407875061, 0.8666742444038391, 0.13240855932235718, 0.2959710955619812, 0.7029313445091248, 0.26764655113220215, 0.7322689294815063, 0.6465643644332886, 0.34978073835372925], "Term": ["also", "also", "amp", "anker", "anker", 
"battery", "bit", "brand", "buy", "buy", "cable", "car", "car", "charge", "charge", "charger", "charger", "cheap", "compact", "contact", "cord", "could", "customer_service", "device", "drive", "enough", "even", "excellent", "expect", "fast", "fit", "fit", "get", "give", "go", "good", "good", "great", "great", "happy", "high", "hot", "iphone", "item", "keep", "let", "light", "little", "look", "love", "love", "make", "make", "need", "never", "new", "nice", "outlet", "perfect", "phone", "plug", "port", "power", "price", "product", "purchase", "purchase", "quality", "quick", "quickly", "really", "replace", "review", "seem", "send", "small", "speed", "star", "stay", "tablet", "take", "time", "time", "try", "update", "usb", "use", "use", "well", "well", "work", "work", "would", "would"]}, "R": 30, "lambda.step": 0.01, "plot.opts": {"xlab": "PC1", "ylab": "PC2"}, "topic.order": [2, 1]};
 9 | 
// Append a <script> element for `url` to the document <head>.
// `callback` is installed as both the onload and the onreadystatechange
// handler of that element; a load failure only logs a console warning.
function LDAvis_load_lib(url, callback){
  var scriptTag = document.createElement('script');
  scriptTag.async = true;
  scriptTag.onload = callback;
  scriptTag.onreadystatechange = callback;
  scriptTag.onerror = function(){
    console.warn("failed to load library " + url);
  };
  scriptTag.src = url;
  var headNode = document.getElementsByTagName("head")[0];
  headNode.appendChild(scriptTag);
}
18 | 
// Bootstrap the visualization, picking one of three loading strategies.
// The LDAvis script is fetched from jsDelivr's GitHub CDN: the RawGit host
// (cdn.rawgit.com) that this generated page originally used has been
// discontinued. NOTE(review): confirm the `files` branch path resolves.
if(typeof(LDAvis) !== "undefined"){
   // already loaded: just create the visualization
   !function(LDAvis){
       new LDAvis("#" + "ldavis_el224751401455801140008610013083", ldavis_el224751401455801140008610013083_data);
   }(LDAvis);
}else if(typeof define === "function" && define.amd){
   // require.js is available: use it to load d3/LDAvis
   require.config({paths: {d3: "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min"}});
   require(["d3"], function(d3){
      // the LDAvis script is loaded as a plain <script>, so expose d3 as a global for it
      window.d3 = d3;
      LDAvis_load_lib("https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@files/ldavis.v1.0.0.js", function(){
        new LDAvis("#" + "ldavis_el224751401455801140008610013083", ldavis_el224751401455801140008610013083_data);
      });
    });
}else{
    // require.js not available: dynamically load d3 & LDAvis (d3 first, then LDAvis)
    LDAvis_load_lib("https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js", function(){
         LDAvis_load_lib("https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@files/ldavis.v1.0.0.js", function(){
                 new LDAvis("#" + "ldavis_el224751401455801140008610013083", ldavis_el224751401455801140008610013083_data);
            })
         });
}
41 | </script>


--------------------------------------------------------------------------------
/scripts/data_exploration.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | ### Import libraries ###
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from pandas import Grouper
  8 | import matplotlib.pyplot as plt
  9 | import seaborn as sns
 10 | color = sns.color_palette()
 11 | sns.set_style(style="darkgrid")
 12 | from data_utils import most_reviewed_products
 13 | from pathlib import Path
 14 | from matplotlib import rcParams
 15 | import json
 16 | 
 17 | # Default text styling for figures
 18 | rcParams['font.family'] = 'sans-serif'
 19 | rcParams['font.sans-serif'] = ['Inter']
 20 | rcParams['font.weight'] = 500
 21 | rcParams['xtick.labelsize'] = 13
 22 | rcParams['ytick.labelsize'] = 13
 23 | 
 24 | figOutputPath = Path("../figures/")
 25 | 
 26 | ### Functions ###
 27 | 
 28 | def label_typography(ax):
 29 |     ax.xaxis.label.set_fontweight(500)
 30 |     ax.yaxis.label.set_fontsize(15)
 31 |     ax.yaxis.label.set_fontweight(500)
 32 |     ax.xaxis.label.set_fontsize(15)
 33 |     return
 34 | 
 35 | def most_active_reviewers(df, n_reviewers):
 36 |     n_reviews = df['reviewerID'].value_counts()
 37 |     most_reviews = n_reviews.nlargest(n_reviewers)
 38 |     most_reviews = most_reviews.reset_index()
 39 |     most_reviews = most_reviews.drop('reviewerID', axis=1)
 40 |     definitive = df.merge(most_reviews, left_on='reviewerID', right_on='index')
 41 |     definitive = definitive.drop('index', axis=1)
 42 |     return definitive
 43 | 
 44 | def analyze_reviews(df, df_attribute, name_file, xlabel):
 45 |     fig, ax = plt.subplots(figsize=(10, 10))
 46 |     sns.countplot(df_attribute, ax=ax)
 47 |     label_typography(ax)
 48 | 
 49 |     # Set and style the title, and move it up a bit (1.02 = 2%)
 50 |     #ax.set_title(title, fontname='Inter', fontsize=20, fontweight=500, y=1.02)
 51 |     
 52 |     ax.xaxis.label.set_text(xlabel)
 53 |     ax.yaxis.label.set_text("Review count")
 54 |     if (name_file=="review_distribution_per_day"):
 55 |         ax.set_xticklabels(["Sunday", "Monday", "Thuesday", "Wednesday", "Thursday", "Friday", "Saturday"])
 56 |         ax.xaxis.label.set_fontsize(13)
 57 |         ax.set_yticks([0, 100000, 200000])
 58 |         ax.set_yticklabels(["0", "100K", "200K"])
 59 |     elif (name_file=="review_distribution_per_month"):
 60 |         ax.set_xticklabels(["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"])
 61 |         ax.xaxis.label.set_fontsize(13)
 62 |         ax.set_yticks([0, 100000, 200000])
 63 |         ax.set_yticklabels(["0", "100K", "200K"])
 64 |     elif (name_file=="review_distribution_per_year"):
 65 |         ax.set_xticklabels([2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])
 66 |         ax.xaxis.label.set_fontsize(13)
 67 |         ax.set_yticks([0, 100000, 200000])
 68 |         ax.set_yticklabels(["0", "100K", "200K"])
 69 |     elif (name_file=="unverified_overall_distribution"):
 70 |         ax.set_yticks([0, 50000, 100000])
 71 |         ax.set_yticklabels(["0", "50K", "100K"])
 72 |     elif (name_file=="verified_overall_distribution"):
 73 |         ax.set_yticks([0, 300000, 600000])
 74 |         ax.set_yticklabels(["0", "300K", "600K"])
 75 |     else:
 76 |         ax.set_yticks([0, 100000, 500000, 1000000])
 77 |         ax.set_yticklabels(["0", "100K", "500K", "1M"])
 78 | 
 79 | 
 80 | 
 81 |     ax.figure.savefig(figOutputPath / '1_{0}.svg'.format(name_file), format='svg')
 82 |     print('Exported 1_{}.svg'.format(name_file))
 83 | 
 84 | def run(df):
 85 |     # 1 - Countplot: overall distribution    
 86 |     analyze_reviews(df, df.overall, 'overall_distribution', 'Overall')
 87 |     
 88 |     # 2 - Countplot: opinion distribution    
 89 |     analyze_reviews(df, df.opinion, 'opinion_distribution', 'Opinion')
 90 | 
 91 |     # 3 - Distribution of words
 92 |     reduced_df = df.copy()
 93 |     reduced_df = reduced_df[reduced_df['n_words'] <= 1000]
 94 |     fig, ax5 = plt.subplots()
 95 |     ax5 = sns.violinplot(x=reduced_df['opinion'], y=reduced_df['n_words'])
 96 |     #ax5.set_title('Distribution of words in review for each opinion')
 97 |     ax5.xaxis.label.set_text("Opinion")
 98 |     ax5.yaxis.label.set_text("Number of words")
 99 |     label_typography(ax5)
100 |     ax5.figure.savefig(figOutputPath / '1_correlation_words_opinion.svg', format='svg')
101 |     
102 |     # 4 - Review distribution per day
103 |     analyze_reviews(df, df.week_day, 'review_distribution_per_day', 'Day')
104 |     
105 |     # 5 - Top 20 products
106 |     fig, ax3 = plt.subplots(figsize=(15, 15))
107 |     top_products = most_reviewed_products(df, 20)
108 |     r = list(top_products['asin'].unique())
109 |     positive = list(top_products.loc[top_products['opinion'] == 'positive', 'asin'].value_counts().reindex(top_products['asin'].unique(), fill_value=0))
110 |     neutral = list(top_products.loc[top_products['opinion'] == 'neutral', 'asin'].value_counts().reindex(top_products['asin'].unique(), fill_value=0))
111 |     negative = list(top_products.loc[top_products['opinion'] == 'negative', 'asin'].value_counts().reindex(top_products['asin'].unique(), fill_value=0))
112 |     raw_data = {'positive': positive, 'neutral': neutral, 'negative': negative}
113 |     raw_data = pd.DataFrame(raw_data)
114 |     
115 |     totals = [i+j+k for i,j,k in zip(raw_data['positive'], raw_data['neutral'], raw_data['negative'])]
116 |     positive_percentage = [i / j * 100 for i, j in zip(raw_data['positive'], totals)]
117 |     neutral_percentage = [i / j * 100 for i, j in zip(raw_data['neutral'], totals)]
118 |     negative_percentage = [i / j * 100 for i, j in zip(raw_data['negative'], totals)]
119 | 
120 |     bar_width = 0.85
121 | 
122 |     ax3.bar(r, positive_percentage, color='#b5ffb9', edgecolor='white', width=bar_width, label='positive')
123 |     ax3.bar(r, neutral_percentage, bottom=positive_percentage, color='#f9bc86', edgecolor='white', width=bar_width, label='neutral')
124 |     ax3.bar(r, negative_percentage, bottom=[i + j for i, j in zip(positive_percentage, neutral_percentage)], color='#a3acff', edgecolor='white', width=bar_width, label='negative')
125 |     ax3.set_xticklabels(r, rotation=90)
126 |     ax3.set_xlabel('Unique product')
127 |     ax3.set_xticks([])
128 |     ax3.set_ylabel('Percentage')
129 |     ax3.set_xticks([])
130 |     label_typography(ax3)
131 |     #legend = ax3.legend(loc='lower left', shadow=True, fontsize='large')
132 |     #legend.get_frame().set_facecolor('#00FFCC')
133 |     #ax3.set_title('Opinion for besteller products')
134 |     ax3.figure.savefig(figOutputPath / '1_sentiment_reviews_bestseller_products.svg', format='svg')
135 |     print("Exported 1_sentiment_reviews_besteller_products.svg")
136 | 
137 |     # 6 - Top 50 reviewers
138 |     fig, ax4 = plt.subplots(figsize=(15, 15))
139 |     top_reviewers = most_active_reviewers(df, 50)
140 |     sns.countplot(top_reviewers.reviewerID, ax=ax4, order=top_reviewers['reviewerID'].value_counts().index)
141 |     r = list(top_reviewers['reviewerID'].unique())
142 |     ax4.set_xticklabels(r, rotation=90)
143 |     ax4.set_ylabel('Review count')
144 |     ax4.set_xlabel('Unique Reviewers')
145 |     ax4.set_xticks([])
146 |     label_typography(ax4)
147 |     #ax4.set_title('Reviewers with most reviews')
148 |     ax4.figure.savefig(figOutputPath / '1_reviewers_most_reviews.svg', format='svg')
149 |     
150 |     # 7 - Opinion of top reviewers
151 |     fig, ax6 = plt.subplots(figsize=(15, 15))
152 |     top_reviewers = most_active_reviewers(df, 50)
153 |     r = list(top_reviewers['reviewerID'].unique())
154 |     positive = list(top_reviewers.loc[top_reviewers['opinion'] == 'positive', 'reviewerID'].value_counts().reindex(top_reviewers['reviewerID'].unique(), fill_value=0))
155 |     neutral = list(top_reviewers.loc[top_reviewers['opinion'] == 'neutral', 'reviewerID'].value_counts().reindex(top_reviewers['reviewerID'].unique(), fill_value=0))
156 |     negative = list(top_reviewers.loc[top_reviewers['opinion'] == 'negative', 'reviewerID'].value_counts().reindex(top_reviewers['reviewerID'].unique(), fill_value=0))
157 |     raw_data = {'positive': positive, 'neutral': neutral, 'negative': negative}
158 |     raw_data = pd.DataFrame(raw_data)
159 | 
160 |     #print("Opinions ",raw_data)
161 |     
162 |     totals = [i+j+k for i,j,k in zip(raw_data['positive'], raw_data['neutral'], raw_data['negative'])]
163 |     #totals = list(top_products['asin'].value_counts().reindex(top_products['asin'].unique(), fill_value=0))
164 |     positive_percentage = [i / j * 100 for i, j in zip(raw_data['positive'], totals)]
165 |     neutral_percentage = [i / j * 100 for i, j in zip(raw_data['neutral'], totals)]
166 |     negative_percentage = [i / j * 100 for i, j in zip(raw_data['negative'], totals)]
167 | 
168 |     bar_width = 1
169 | 
170 |     ax6.bar(r, positive_percentage, color='#b5ffb9', edgecolor='white', width=bar_width, label='positive')
171 |     ax6.bar(r, neutral_percentage, bottom=positive_percentage, color='#f9bc86', edgecolor='white', width=bar_width, label='neutral')
172 |     ax6.bar(r, negative_percentage, bottom=[i + j for i, j in zip(positive_percentage, neutral_percentage)], color='#a3acff', edgecolor='white', width=bar_width, label='negative')
173 |     ax6.set_xticklabels(r, rotation=90)
174 |     ax6.set_xlabel('Unique Reviewers')
175 |     ax3.set_xticks([])
176 |     ax6.set_xticks([])
177 |     ax6.set_ylabel('Percentage')
178 |     label_typography(ax6)
179 |     label_typography(ax3)
180 |     #legend = ax6.legend(loc='lower left', shadow=True, fontsize='large')
181 |     #legend.get_frame().set_facecolor('#00FFCC')
182 |     #ax6.set_title('Opinion of top reviewers')
183 |     #plt.show()
184 |     ax6.figure.savefig(figOutputPath / '1_opinion_top_reviewers.svg', format='svg')
185 |     print("Exported 1_opinion_top_reviewers.svg")
186 |     
187 |     # 8 - Unverified reviews
188 |     unverified = df[df['verified'] == False]
189 |     analyze_reviews(unverified, unverified.overall, 'unverified_overall_distribution', 'Overall')
190 | 
191 |     # 9 - Verified reviews
192 |     verified = df[df['verified'] == True]
193 |     analyze_reviews(verified, verified.overall, 'verified_overall_distribution', 'Overall')
194 | 
195 |     # 10 - verified vs unverified of top 50 reviewers
196 |     fig, ax7 = plt.subplots(figsize=(15, 15))
197 |     r = list(top_reviewers['reviewerID'].unique())
198 |     verified = list(top_reviewers.loc[top_reviewers['verified'] == True, 'reviewerID'].value_counts().reindex(top_reviewers['reviewerID'].unique(), fill_value=0))
199 |     unverified = list(top_reviewers.loc[top_reviewers['verified'] == False, 'reviewerID'].value_counts().reindex(top_reviewers['reviewerID'].unique(), fill_value=0))
200 |     raw_data = {'verified': verified, 'unverified': unverified}
201 |     raw_data = pd.DataFrame(raw_data)
202 | 
203 |     totals = [i+j for i,j in zip(raw_data['verified'], raw_data['unverified'])]
204 |     verified_percentage = [i / j * 100 for i, j in zip(raw_data['verified'], totals)]
205 |     unverified_percentage = [i / j * 100 for i, j in zip(raw_data['unverified'], totals)]
206 | 
207 |     bar_width = 1
208 | 
209 |     ax7.bar(r, verified_percentage, color='#b5ffb9', edgecolor='white', width=bar_width, label='verified')
210 |     ax7.bar(r, unverified_percentage, bottom=verified_percentage, color='#f9bc86', edgecolor='white', width=bar_width, label='unverified')
211 |     ax7.set_xticklabels(r, rotation=90)
212 | 
213 |     ax7.set_xlabel('Unique Reviewers')
214 |     ax7.set_xticks([])
215 |     ax3.set_xticks([])
216 |     ax7.set_ylabel('Percentage')
217 |     label_typography(ax3)
218 |     label_typography(ax7)
219 |     #legend = ax7.legend(loc='upper right', shadow=True, fontsize='large')
220 |     #legend.get_frame().set_facecolor('#00FFCC')
221 |     #ax7.set_title('Verified vs Unverified reviews of top reviewers')
222 |     #plt.show()
223 |     ax7.figure.savefig(figOutputPath / '1_verified_unverified.svg', format='svg')
224 |     print("Exported 1_verified_unverified.svg")
225 |     
226 | 
227 | # Exporting raw data for the web demo
228 | 
def top_50_products_verified_unverified_both(df):
    """Export mean ratings (verified / unverified / all) of the top products.

    Despite its name, the function asks ``most_reviewed_products`` for 5
    products. For each of them it computes the mean ``overall`` score over
    verified reviews, unverified reviews and all reviews, prints the values
    and writes them to ``ver_unver.json`` in the current working directory.

    A product without reviews in one of the groups gets ``null`` in the JSON
    file: the mean of an empty selection is NaN, which ``json.dump`` would
    otherwise emit as a bare ``NaN`` token that strict JSON parsers reject.
    """
    print("top_50_products_verified_unverified_both")
    top_products = most_reviewed_products(df, 5)
    products = list(top_products['asin'].unique())
    verified_series = []
    unverified_series = []
    overall_series = []

    def _json_mean(scores):
        # NaN (empty selection) -> None so that the output stays valid JSON.
        mean = scores.mean()
        return None if pd.isna(mean) else float(mean)

    for asin in products:
        print("Product: ", asin)
        # Select this product's rows once instead of re-scanning df per group.
        product_rows = df[df['asin'] == asin]

        verified = _json_mean(product_rows.loc[product_rows['verified'] == True, 'overall'])
        print("-verified: ", verified)
        verified_series.append(verified)

        unverified = _json_mean(product_rows.loc[product_rows['verified'] == False, 'overall'])
        unverified_series.append(unverified)
        print("-unverified: ", unverified)

        aall = _json_mean(product_rows['overall'])
        overall_series.append(aall)
        print("-all: ", aall)

    obj = [
        {"name": "products", "data": products},
        {"name": "verified", "data": verified_series},
        {"name": "unverified", "data": unverified_series},
        {"name": "all", "data": overall_series},
    ]

    with open('ver_unver.json', 'w') as outfile:
        json.dump(obj, outfile, indent=2, sort_keys=True)

    print(products)
266 | 
def count_reviews(df):
    """Export per-score review counts of the top products to ``ver_counts.json``.

    For every product returned by ``most_reviewed_products(df, 20)`` and for
    every score from 1 to 5, the output holds
    ``{"name": score, "data": [verified_count, unverified_count]}``.
    The JSON object has two keys: ``products`` (the ASIN list) and ``count``
    (one list of five entries per product, in the same order).
    """
    top_products = most_reviewed_products(df, 20)
    products = list(top_products['asin'].unique())
    # One element per product
    counts_per_product = []

    for asin in products:
        print("Product: ", asin)
        # Filter once per product and per verification flag; only the score
        # comparison remains inside the 1..5 loop.
        product_rows = df[df['asin'] == asin]
        verified_scores = product_rows.loc[product_rows['verified'] == True, 'overall']
        unverified_scores = product_rows.loc[product_rows['verified'] == False, 'overall']

        counts_per_product.append([
            {
                "name": score,
                "data": [
                    int((verified_scores == score).sum()),
                    int((unverified_scores == score).sum()),
                ],
            }
            for score in range(1, 6)
        ])

    obj = {'products': products, 'count': counts_per_product}

    with open('ver_counts.json', 'w') as outfile:
        json.dump(obj, outfile, indent=2, sort_keys=True)
293 | 
294 | 
def year_month_day_reviews(df):
    """Export the review-count plots per week day, per month and per year."""
    plot_specs = (
        ('week_day', 'review_distribution_per_day', 'Day'),
        ('month', 'review_distribution_per_month', 'Month'),
        ('year', 'review_distribution_per_year', 'Year'),
    )
    for column_name, figure_name, axis_label in plot_specs:
        analyze_reviews(df, getattr(df, column_name), figure_name, axis_label)
299 | 
def export_week_day(df):
    """Print, for each score 1..5, the number of reviews per ``week_day``.

    Each line starts with the score, followed by the grouped sizes as a
    pandas Series.
    """
    for score in range(1, 6):
        same_score = df.loc[df['overall'] == score]
        per_day = same_score.groupby(['week_day']).size()
        print(score, per_day)
303 | 
def export_month(df):
    """Print, for each score 1..5, the list of review counts per ``month``."""
    for score in range(1, 6):
        per_month = df.loc[df['overall'] == score].groupby(['month']).size()
        print(score, per_month.tolist())
307 | 
def export_year(df):
    """Print, for each score 1..5, the list of review counts per ``year``."""
    for score in range(1, 6):
        per_year = df.loc[df['overall'] == score].groupby(['year']).size()
        print(score, per_year.tolist())
311 | 


--------------------------------------------------------------------------------
/scripts/topic_analysis.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | ### Import libraries ###
  4 | 
  5 | import nltk
  6 | from nltk.sentiment.vader import SentimentIntensityAnalyzer
  7 | nltk.download('vader_lexicon')
  8 | nltk.download('wordnet')
  9 | import gensim
 10 | import matplotlib.pyplot as plt
 11 | import pandas as pd
 12 | import pyLDAvis.gensim
 13 | from data_utils import most_reviewed_products
 14 | from pathlib import Path
 15 | 
 16 | figures_folder = Path("../figures/")
 17 | dataframes_folder = Path("../dataframes/")
 18 | 
 19 | ### Functions ###
 20 | 
 21 | def worst_products_asin(df, n_worst):
 22 |     if n_worst == 0:
 23 |         return []
 24 |     top_products = most_reviewed_products(df, 20)
 25 |     overall_mean = top_products.groupby(['asin'], as_index=False)['overall'].mean()
 26 |     overall_mean = overall_mean.sort_values('overall', ascending=True)
 27 |     worst_n_products = overall_mean['asin'].iloc[:n_worst].tolist()
 28 |     return worst_n_products
 29 |     
 30 | 
 31 | def best_products_asin(df, n_best):
 32 |     if n_best == 0:
 33 |         return []
 34 |     top_products = most_reviewed_products(df, 20)
 35 |     overall_mean = top_products.groupby(['asin'], as_index=False)['overall'].mean()
 36 |     overall_mean = overall_mean.sort_values('overall', ascending=False)
 37 |     best_n_products = overall_mean['asin'].iloc[:n_best].tolist()
 38 |     return best_n_products
 39 | 
 40 | 
 41 | def products_to_analyze(df, n_best=0, n_worst=0):
 42 |     worst = worst_products_asin(df, n_worst)
 43 |     best = best_products_asin(df, n_best)
 44 |     products = worst + best
 45 |     if products == []:
 46 |         # Most reviewed product
 47 |         product_id = df.asin.mode().iloc[0]
 48 |         return [product_id]
 49 |     else:
 50 |         return products
 51 |         
 52 |     
 53 | def create_dictionary(texts):
 54 |     dictionary = gensim.corpora.Dictionary(texts)
 55 |     #dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
 56 |     dictionary.filter_extremes(keep_n=10000)
 57 |     dictionary.compactify()
 58 |     return dictionary
 59 |     
 60 |             
 61 | def make_bigrams(texts):
 62 |     bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
 63 |     bigram_mod = gensim.models.phrases.Phraser(bigram)
 64 |     return [bigram_mod[doc] for doc in texts]
 65 | 
 66 | 
 67 | def bag_of_words(texts, dictionary):
 68 |     corpus = [dictionary.doc2bow(text) for text in texts]
 69 |     return corpus
 70 |     
 71 | 
 72 | def compute_lda_model(corpus, num_topics, dictionary, texts, alpha, beta):
 73 |     lda_models, coherences = [], []
 74 |     for n in num_topics:
 75 |         model = gensim.models.LdaModel(corpus=corpus, 
 76 |                                        num_topics=n, 
 77 |                                        random_state=42, 
 78 |                                        chunksize=100,
 79 |                                        id2word=dictionary, 
 80 |                                        passes=10, 
 81 |                                        alpha=alpha,
 82 |                                        eta=beta)
 83 |         lda_models.append(model)
 84 |         cm = gensim.models.ldamodel.CoherenceModel(model=model, 
 85 |                                                    dictionary=dictionary, 
 86 |                                                    coherence='c_v', 
 87 |                                                    texts=texts)
 88 |         coherences.append(cm.get_coherence())
 89 |         print('\nNumber of topic:', n)
 90 |     return coherences, lda_models
 91 | 
 92 | 
 93 | def compute_multiple_lda_models(alphas, betas, num_topics, corpus, texts, dictionary):
 94 |     all_coherences, all_lda_models, all_parameters = [], [], []
 95 |     for alpha in alphas:
 96 |         for beta in betas:
 97 |             coherences, lda_models = compute_lda_model(corpus=corpus, 
 98 |                                                        num_topics=num_topics, 
 99 |                                                        dictionary=dictionary, 
100 |                                                        texts=texts,
101 |                                                        alpha=alpha,
102 |                                                        beta=beta)
103 |             all_coherences.append(coherences)
104 |             all_lda_models.append(lda_models)
105 |             all_parameters.append([alpha, beta])
106 |     return all_coherences, all_lda_models, all_parameters
107 | 
108 | 
def plot_coherence(num_topics, coherence, product_asin):
    """Plot coherence against topic count and save it as an SVG figure.

    ``num_topics`` is how many topic counts were evaluated; the x axis runs
    from 2 to ``num_topics + 1``. The figure is written to
    ``figures_folder`` as ``3_coherence_plot_<product_asin>.svg``.
    """
    fig, ax0 = plt.subplots()
    topic_counts = range(2, 2 + num_topics)
    ax0.plot(topic_counts, coherence)
    ax0.set_xlabel("Number of topics")
    ax0.set_ylabel("Coherence score")
    output_path = figures_folder / '3_coherence_plot_{0}.svg'.format(product_asin)
    ax0.figure.savefig(output_path, format='svg')
116 |     
117 |     
def show_topics(model, ideal_topics, num_words, product_asin):
    """Print the model's topics and pickle a table of their top words.

    The first ``ideal_topics`` topics each become a column ``Topic # NN``
    holding their ``num_words`` top words. The table is printed and stored
    in ``dataframes_folder`` as ``topics_<product_asin>.pkl``.
    """
    for topic in model.show_topics():
        print(topic)

    top_words = {}
    for topic_idx in range(ideal_topics):
        column = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        entries = model.show_topic(topic_idx, topn=num_words)
        # Each entry's first element is the word; the rest is not needed here.
        top_words[column] = [entry[0] for entry in entries]

    topic_df = pd.DataFrame(top_words)
    topic_df.to_pickle(dataframes_folder / 'topics_{}.pkl'.format(product_asin))
    print(topic_df)
130 |     
131 |     
def topic_visualization(model, corpus, dictionary, product_asin):
    """Save a pyLDAvis page for ``model`` as ``lda_<product_asin>.html``.

    The file is written relative to the current working directory.
    """
    output_name = 'lda_{0}.html'.format(product_asin)
    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=True)
    pyLDAvis.save_html(prepared, output_name)
135 |     
136 | '''
137 | def format_topics_sentences(model, corpus, texts):
138 |     # Get main topic reviews
139 |     # Init output
140 |     df = pd.DataFrame()
141 | 
142 |     # Get main topic in each document
143 |     for i, row in enumerate(model[corpus]):
144 |         row = sorted(row, key=lambda x: (x[1]), reverse=True)
145 |         # Get the Dominant topic, Perc Contribution and Keywords for each document
146 |         for j, (topic_num, prop_topic) in enumerate(row):
147 |             if j == 0:  # => dominant topic
148 |                 # probability pairs for the most relevant words generated by the topic
149 |                 wp = model.show_topic(topic_num)
150 |                 topic_keywords = ", ".join([word for word, prop in wp])
151 |                 df = df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
152 |             else:
153 |                 break
154 |     #df.columns = ['Dominant_Topic', 'Topic_Perc_Contribution', 'Topic_Keywords']
155 | 
156 |     # Add original text to the end of the output
157 |     contents = pd.Series(texts)
158 |     df = pd.concat([df, contents], axis=1)
159 |     df = df.reset_index()
160 |     df.columns = ['review', 'topic_num', 'topic_perc_contribution', 'keywords', 'text']
161 |     return df
162 | 
163 | 
164 | def sentiment_polarity(df):
165 |     sentiment = pd.DataFrame()
166 |     sentiment = pd.concat([sentiment, df], ignore_index=True)
167 |     analyser = SentimentIntensityAnalyzer()
168 |     sentiment['sentiments'] = sentiment['text'].str.join(' ').apply(lambda x:
169 |                                                           analyser.polarity_scores(x))
170 |     sentiment = pd.concat([sentiment.drop(['sentiments'], axis=1), 
171 |                            sentiment['sentiments'].apply(pd.Series)],
172 |                           axis=1)
173 |     # Numbers of words
174 |     sentiment['words_nb'] = sentiment["text"].apply(lambda x: len(x))
175 |     sentiment_final = sentiment.groupby(['topic_num', 
176 |                                          'keywords']).agg({'neg':'mean',
177 |                                                            'neu':'mean',
178 |                                                            'pos':'mean',
179 |                                                            'compound':'mean',
180 |                                                            'topic_perc_contribution':'count'}).reset_index()
181 |     return sentiment_final, sentiment
182 | 
183 | 
184 | def most_representative_document(df):
185 |     # Most representative document for each topic
186 |     sent_topics_sorted_df = pd.DataFrame()
187 |     sent_topics_outdf_grpd = df.groupby('topic_num')
188 |     for i, grp in sent_topics_outdf_grpd:
189 |         sent_topics_sorted_df = pd.concat([sent_topics_sorted_df, 
190 |                                            grp.sort_values(['topic_perc_contribution'], 
191 |                                                            ascending=[0]).head(1)], 
192 |                                            axis=0)
193 |     sent_topics_sorted_df.reset_index(drop=True, inplace=True)
194 |     sent_topics_sorted_df.columns = ['review', 'topic_num', 'topic_perc_contribution', 'keywords', 'text']
195 |     sent_topics_sorted_df.drop(['review'], axis=1, inplace=True)
196 |     return sent_topics_sorted_df
197 | 
198 | 
199 | def topic_distribution_across_documents(df, sentiment):
200 |     # Number of Documents for Each Topic
201 |     sentiment.rename(columns={'dominant_topic':'topic'})
202 |     topic_counts = df['topic_num'].value_counts()
203 |     topic_contribution = round(topic_counts/topic_counts.sum(), 4)
204 |     topic_contribution.rename(columns={'topic_num':'perc_contribution'})
205 |     df_dominant_topics = pd.concat([sentiment, topic_contribution], axis=1)
206 |     # Change Column names
207 |     #df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
208 |     # Show
209 |     return df_dominant_topics
210 | 
211 | 
212 | '''
213 | 
def run_for_custom_analysis(df):
    """Run the LDA grid search on the first 15000 rows of ``df``.

    Every row is treated as belonging to one pseudo-product labelled
    ``PORTABLECHARGERS``. The best model (highest coherence over the alpha /
    beta / topic-count grid) is reported, plotted, tabulated and exported.
    """
    print("CUSTOM LDA ANALYSIS!")
    product = "PORTABLECHARGERS"
    sample = df.head(15000)

    # Tokenise, merge bigrams, then build the vocabulary and the corpus.
    tokenised = [review.split(' ') for review in sample['preprocessedReview']]
    bigram_reviews = make_bigrams(tokenised)
    dictionary = create_dictionary(bigram_reviews)
    bow_corpus = bag_of_words(bigram_reviews, dictionary)

    # Hyper-parameter grid: topic counts 2..10, two alphas, three betas.
    max_topics = 10
    num_topics = list(range(2, max_topics + 1))
    alpha_list = [0.1, 1]
    beta_list = [0.01, 0.1, 1]
    all_coherences, all_lda_models, all_parameters = compute_multiple_lda_models(
        alphas=alpha_list,
        betas=beta_list,
        num_topics=num_topics,
        corpus=bow_corpus,
        texts=bigram_reviews,
        dictionary=dictionary)

    # Best coherence and its (parameter-combination, topic-count) position.
    scored_positions = ((score, (combo_idx, topic_idx))
                        for combo_idx, scores in enumerate(all_coherences)
                        for topic_idx, score in enumerate(scores))
    best_coherence_value, (best_combo, best_topic_idx) = max(scored_positions)

    best_alpha, best_beta = all_parameters[best_combo][0], all_parameters[best_combo][1]
    best_model = all_lda_models[best_combo][best_topic_idx]
    print('Best model has {} coherence with {} alpha value and {} beta value'.format(
        best_coherence_value, best_alpha, best_beta))

    best_coherences = all_coherences[best_combo]
    best_num_topics = num_topics[0] + best_topic_idx
    print('Best num of topics: {}'.format(best_num_topics))

    plot_coherence(len(num_topics), best_coherences, product)
    show_topics(best_model, best_num_topics, 10, product)
    topic_visualization(best_model, bow_corpus, dictionary, product)
248 | 
249 | 
def run(df):
    """Run the LDA grid search for the 3 worst and 3 best rated products.

    For each product selected by ``products_to_analyze`` the reviews are
    tokenised, bigram-merged and turned into a bag-of-words corpus. LDA
    models are trained over a grid of alpha, beta and topic counts (2..10);
    the model with the highest coherence is reported, its coherence curve is
    plotted, its topics are tabulated and a pyLDAvis page is saved.

    Every product is always recomputed: the former "already computed" branch
    sat behind ``if False:`` and could never execute.
    """
    product_list = products_to_analyze(df, n_best=3, n_worst=3)

    # The hyper-parameter grid is the same for every product.
    max_topics = 10
    alpha_list = [0.1, 1]
    beta_list = [0.01, 0.1, 1]
    num_topics = list(range(2, max_topics + 1))

    for product in product_list:
        print(product)
        df_product = df[df['asin'] == product]
        reviews_product = [r.split(' ') for r in df_product['preprocessedReview']]
        bigram_reviews = make_bigrams(reviews_product)
        dictionary = create_dictionary(bigram_reviews)
        bow_corpus = bag_of_words(bigram_reviews, dictionary)
        all_coherences, all_lda_models, all_parameters = compute_multiple_lda_models(alphas=alpha_list,
                                                                                     betas=beta_list,
                                                                                     num_topics=num_topics,
                                                                                     corpus=bow_corpus,
                                                                                     texts=bigram_reviews,
                                                                                     dictionary=dictionary)
        # Extract best coherence and its (combination, topic-count) index
        best_coherence_value, index_best_value = max((x, (i, j))
                                                     for i, row in enumerate(all_coherences)
                                                     for j, x in enumerate(row))
        best_alpha = all_parameters[index_best_value[0]][0]
        best_beta = all_parameters[index_best_value[0]][1]
        best_model = all_lda_models[index_best_value[0]][index_best_value[1]]
        print('Best model has {} coherence with {} alpha value and {} beta value'.format(best_coherence_value,
                                                                                         best_alpha,
                                                                                         best_beta))
        best_coherences = all_coherences[index_best_value[0]]
        best_num_topics = num_topics[0] + index_best_value[1]
        print('Best num of topics: {}'.format(best_num_topics))
        plot_coherence(len(num_topics), best_coherences, product)
        show_topics(best_model, best_num_topics, 10, product)
        topic_visualization(best_model, bow_corpus, dictionary, product)
        # Disabled post-processing, kept for reference:
        #
        # topic_sents_keywords = format_topics_sentences(best_model, bow_corpus, bigram_reviews)
        #
        # topic_sents_keywords.to_pickle('dataframes/topic_sents_keywords.pkl')
        #
        # sentiment_df, words = sentiment_polarity(topic_sents_keywords)
        # pos = words[words["words_nb"] >= 5].sort_values("pos", ascending = False)[["text", "pos"]].head(20)
        # neg = words[words["words_nb"] >= 5].sort_values("neg", ascending = False)[["text", "neg"]].head(20)
        #
        # most_repr_rews = most_representative_document(topic_sents_keywords)
        # df_dominant_topics = topic_distribution_across_documents(topic_sents_keywords, sentiment_df)
303 |             


--------------------------------------------------------------------------------
/report/2_esplorazione.tex:
--------------------------------------------------------------------------------
  1 | \chapter{Esplorazione dei dati}
  2 | \label{exploration}
  3 | 
  4 | \section{Informazioni preliminari sul dominio}
  5 | Prima di addentrarci nell'analisi del nostro dataset, che si limita ad una categoria, abbiamo cercato delle visualizzazioni globali dell'intero marketplace Amazon. La Figura \ref{fig1} ci mostra un carattere fortemente stagionale: gli utenti sono molto più propensi a fornire recensioni nei periodi estivi, nonostante i picchi dei volumi di vendita si verifichino intorno al periodo natalizio \cite{trends}.
  6 | \par 
  7 | 
  8 | \begin{figure}[H]
  9 | 
 10 |   \centering
 11 |   \includegraphics[width=0.95\linewidth]{figures/ext/1_monthly.png}
 12 |   \caption{General Amazon ratings per month \cite{plots1}}
 13 |     \label{fig1}
 14 | \end{figure}
 15 | 
 16 | La Figura \ref{fig2} visualizza invece il contributo di un utente, dandoci un'idea di quanto vocale sia la clientela Amazon, in media.
 17 | 
 18 | \begin{figure}[H]
 19 | 
 20 |   \centering
 21 |   \includegraphics[width=1.1\linewidth]{figures/ext/1_peruser.png}
 22 |   \caption{General Amazon ratings per user \cite{plots1}}
 23 |   \label{fig2}
 24 | \end{figure}
 25 | 
 26 | 
 27 | \section{Descrizione dataset}
 28 | \label{descrizione_dataset}
 29 | Il dataset si presenta in formato JSON e viene caricato in memoria in un DataFrame con la libreria Pandas, molto efficiente per la gestione di dati voluminosi. 
 30 | 
 31 | Le fasi di caricamento e preprocessamento del dataset sono le più impegnative computazionalmente, impiegando gran parte del tempo totale.
 32 | 
 33 | Per ovviare a questo problema e muoverci più agevolmente durante lo sviluppo sfruttiamo la funzione \texttt{to\_pickle} di Pandas per salvare su disco una versione ``cachata'' del dataframe, abbreviando le successive esecuzioni della pipeline.
 34 | 
 35 | \par
 36 | Prima del salvataggio sono state effettuate alcune operazioni utili per rendere il dataset conforme agli obiettivi. In particolare:
 37 | \begin{itemize}
 38 |     \item Il campo \texttt{vote} è stato trasformato da tipo \texttt{object} a tipo \texttt{float}
 39 |     \item Il campo \texttt{reviewText} possedeva alcune recensioni vuote, inutili e perciò eliminate
 40 | \end{itemize}
 41 | Queste operazioni hanno ridotto il dataset portandolo da un totale di recensioni pari a 1128437 a un totale di 1127654, suddivise fra ben 157195 utenti e 48146 prodotti.
 42 | \begin{table}[H]
 43 | \small  
 44 | \centering
 45 | \begin{tabular}{|p{0.20\textwidth}||p{0.10\textwidth}||p{0.55\textwidth}|}
 46 | \hline
 47 | Campo & Tipo & Descrizione  \\
 48 | \hline
 49 | overall & int & Valutazione del prodotto (1-5)\\
 50 | verified & bool & Recensione proveniente da acquisto verificato\\
 51 | reviewTime & string & Data della recensione in formato string\\
 52 | reviewerID & string & Codice univoco del recensore\\
 53 | asin & string & Codice univoco del prodotto\\
 54 | style & string & Dizionario dei metadati del prodotto\\
 55 | reviewerName & string & Nome del recensore\\
 56 | reviewText & string & Testo della recensione\\
 57 | summary & string & Titolo della recensione\\
 58 | unixReviewTime & int & Data della recensione in formato unix\\
 59 | vote & float & Numero di voti della recensione \\
 60 | image & string & Immagine associata alla recensione\\
 61 | \hline
 62 | \end{tabular}
 63 | \caption{Campi del dataset con tipo e descrizione}
 64 | \label{table_dataset_description}
 65 | \end{table}
 66 | 
 67 | Il dataset possiede gli attributi mostrati in Tabella \ref{table_dataset_description}. Ogni record del dataset è la rappresentazione di una singola recensione svolta da parte di un utente per un certo prodotto nella data indicata. 
 68 | \par
 69 | Per l'identificazione dell'utente abbiamo a disposizione il campo \texttt{reviewerName} e il campo \texttt{reviewerID}: utilizzeremo solamente quest'ultimo per i nostri scopi.
 70 | Per quanto riguarda i campi relativi alla recensione, abbiamo a disposizione sia \texttt{summary} che \texttt{reviewText}.
 71 | \par
 72 | Per identificare il prodotto abbiamo a disposizione solamente il campo \texttt{asin}, che è un codice univoco da cui si può risalire a maggiori informazioni con l'utilizzo delle API Amazon o software di terze parti. 
 73 | \par
 74 | Le recensioni sono classificate come \textit{verificate} se provengono da un acquisto su Amazon per almeno l'80\% del valore originale dell'articolo. L'utente deve aver inoltre speso almeno 50\$ sul proprio account.
 75 | 
 76 | \section{Estensione del dataset}
 77 | 
 78 | A partire dal dataset originale abbiamo creato dei nuovi campi ritenuti di valore per effettuare una fase di esplorazione più approfondita. 
 79 | 
 80 | \subsection{Da \texttt{overall} a \texttt{opinion}}
 81 | \label{overall_opinion}
 82 | Osservando la distribuzione del campo \texttt{overall}, mostrata in Figura \ref{overall_distribution}, possiamo notare un forte sbilanciamento sul valore 5: questa tendenza è presente anche in dataset Amazon di categorie diverse dalla nostra.
 83 | 
 84 | \begin{figure}[H]
 85 |   \centering
 86 |   \includesvg[width=0.9\linewidth]{figures/1_overall_distribution}
 87 |   \caption{Overall distribution}
 88 |   \label{overall_distribution}
 89 | \end{figure}
 90 | 
 91 | In previsione della fase di sentiment analysis, il campo \texttt{overall} è stato utilizzato per la creazione del campo \texttt{opinion}, così composto:
 92 | 
 93 | \begin{itemize}
 94 |     \item I valori 1 e 2 vengono trasformati in \textit{negative}
 95 |     \item Il valore 3 viene trasformato in \textit{neutral}
 96 |     \item I valori 4 e 5 vengono trasformati in \textit{positive}
 97 | \end{itemize}
 98 | 
 99 | In Figura \ref{opinion_distribution} viene mostrata la distribuzione: essa è ovviamente simile a quella già osservata per il campo \texttt{overall} e sarà quindi necessario un bilanciamento del dataset per la fase di sentiment analysis.
100 | 
101 | \begin{figure}[H]
102 |   \centering
103 |   \includesvg[width=0.9\linewidth]{figures/1_opinion_distribution}
104 |   \caption{Opinion distribution}
105 |   \label{opinion_distribution}
106 | \end{figure}
107 | 
108 | \subsection{Conteggio delle parole nelle recensioni}
109 | 
110 | Il campo \texttt{reviewText} è di fondamentale importanza per le fasi di sentiment e topic analysis. Ma per la fase di esplorazione, essendo il testo di una recensione un dato qualitativo, non è di alcun valore. Per questo motivo, abbiamo computato direttamente il numero di parole e creato il campo risultante \texttt{n\_words}. In Figura \ref{distribution_words_opinion} viene mostrata la distribuzione del campo \texttt{n\_words} rispetto al campo \texttt{opinion}, tenendo in considerazione solamente le recensioni con meno di 1000 parole per una questione di visibilità che sarebbe venuta meno considerando anche le (poche) recensioni composte da oltre 1000 parole.
111 | 
112 | \begin{figure}[H]
113 |   \centering
114 |   \includesvg[width=0.9\linewidth]{figures/1_correlation_words_opinion}
115 |   \caption{Distribution of words in review for each opinion}
116 |   \label{distribution_words_opinion}
117 | \end{figure}
118 | 
119 | \subsection{Analisi temporale}
120 | Il campo \texttt{unixReviewTime} fornisce la data della recensione in formato unix. Con alcune semplici manipolazioni del suddetto campo abbiamo creato i seguenti:
121 | 
122 | \begin{itemize}
123 |     \item \texttt{month\_year} nel formato YYYY-MM
124 |     \item \texttt{month} nel formato MM
125 |     \item \texttt{year} nel formato YYYY
126 |     \item \texttt{week\_day} in cui il giorno della settimana è rappresentato con un numero intero (0-6) 
127 | \end{itemize}
128 | 
129 | Il dataset considera recensioni nell'arco di 16 anni circa: più precisamente la prima recensione risale al 23-10-2002, mentre l'ultima al 01-10-2018.
130 | Considerato il dominio trattato, un'analisi di valore è quella di considerare la distribuzione delle recensioni tenendo in considerazione il giorno della settimana, così da mettere in risalto pattern di attività. \par 
131 | Nel caso specifico, come è possibile osservare in Figura \ref{review_dist}, non vi è una dominanza degna di nota nonostante vi sia una tendenza a produrre meno recensioni nelle giornate di venerdì e sabato. 
132 | 
133 | \begin{figure}[H]
134 |   \centering
135 |   \includesvg[width=0.9\linewidth]{figures/1_review_distribution_per_day}
136 |   \caption{Review distribution per day}
137 |   \label{review_dist}
138 | \end{figure}
139 | 
140 | \section{Prodotti più recensiti e recensori più popolari}
141 | Il numero di utenti e di prodotti è nell'ordine delle decine o centinaia di migliaia (come anticipato nella Sezione \ref{descrizione_dataset}) ed è impensabile fare analisi esplorative approfondite su ogni singolo utente e su ogni singolo prodotto. Per questo motivo abbiamo deciso di focalizzare l'attenzione su un numero ristretto di utenti e di prodotti. 
142 | 
143 | \par
144 | 
145 | La Figura \ref{opinion_bestseller_products} mostra i 20 prodotti più popolari in termini di recensioni. Possiamo notare come, seppur ogni prodotto abbia perlopiù un maggior numero di recensioni \textit{positive}, per alcuni prodotti in particolare la percentuale di recensioni \textit{neutrali} e \textit{negative} è elevata rispetto alla distribuzione osservata nella Sezione \ref{overall_opinion}. 
146 | 
147 | \begin{figure}[H]
148 |   \centering
149 |   \includesvg[width=0.9\linewidth]{figures/1_sentiment_reviews_bestseller_products}
150 |   \caption{Opinion for bestseller products}
151 |   \label{opinion_bestseller_products}
152 | \end{figure}
153 | 
154 | \par
155 | La Figura \ref{reviewers_most_reviews} mostra i 50 utenti con più recensioni prodotte, mentre la Figura \ref{opinion_top_reviewers} mostra la distribuzione delle valutazioni delle recensioni effettuate. Possiamo notare come la maggior parte degli utenti considerati dia in percentuale una valutazione in linea con la distribuzione osservata nella Sezione \ref{overall_opinion}, fatta eccezione per casi estremi.
156 | 
157 | \begin{figure}[H]
158 |   \centering
159 |   \includesvg[width=1\linewidth]{figures/1_reviewers_most_reviews}
160 |   \caption{Reviewers with most reviews}
161 |   \label{reviewers_most_reviews}
162 | \end{figure}
163 | 
164 | \begin{figure}[H]
165 |   \centering
166 |   \includesvg[width=1\linewidth]{figures/1_opinion_top_reviewers}
167 |   \caption{Opinion of top reviewers}
168 |   \label{opinion_top_reviewers}
169 | \end{figure}
170 | 
171 | \section{Natura delle recensioni}
172 | Il campo \texttt{verified} merita una trattazione dettagliata per capire se le recensioni \textit{non verificate} siano di valore tanto quanto le recensioni \textit{verificate}. In Figura \ref{ver_unver_overall} si può notare che la distribuzione del campo \texttt{overall} è praticamente identica nei due casi.
173 | 
174 | \begin{figure}[H]
175 |     \centering
176 |     \subfigure[Unverified overall distribution]{\includesvg[width=0.4\linewidth]{figures/1_unverified_overall_distribution}} 
177 |     \subfigure[Verified overall distribution]{\includesvg[width=0.4\linewidth]{figures/1_verified_overall_distribution}}
178 |     \caption{Verified - Unverified overall distribution}
179 |     \label{ver_unver_overall}
180 | \end{figure}
181 | 
182 | In Figura \ref{ver_unver_toprev} è invece possibile notare una particolarità. Come in Figura \ref{opinion_top_reviewers} abbiamo preso i 50 utenti con più recensioni prodotte e la maggior parte delle loro recensioni risulta come \textit{non verificata}.
183 | \par
184 | Alcune riflessioni sono possibili soffermandoci su questa Figura. Come specificato nella Sezione \ref{descrizione_dataset}, le recensioni sono classificate come \textit{verificate} se provengono da un acquisto su Amazon per almeno l'80\% del valore originale dell'articolo. Un'ipotesi è che molti di questi utenti siano i cosiddetti \textit{top recensori}, solitamente posizionati in cima alla lista dei commenti, i quali (in teoria) non acquistano direttamente i prodotti recensiti: questi vengono invece loro prestati affinché possano provarli e scrivere una recensione imparziale.
185 | 
186 | \begin{figure}[H]
187 |   \centering
188 |   \includesvg[width=1\linewidth]{figures/1_verified_unverified}
189 |   \caption{Verified - Unverified reviews of top reviewers}
190 |   \label{ver_unver_toprev}
191 | \end{figure}
192 | 
193 | \section{Correlazioni temporali e relative al traffico}
194 | 
195 | Aggregando temporalmente i dati abbiamo ottenuto alcuni grafici che suggeriscono correlazioni interessanti: la Figura \ref{figtime1} ci mostra come gran parte del traffico attivo sulle recensioni (ovvero gli utenti che votano e danno rilevanza alle recensioni esistenti) si distribuisca su quelle già più popolari, mentre le restanti recensioni rimangono quasi intoccate da grossi picchi di attività di questo tipo.
196 | 
197 | \begin{figure}[H]
198 | 
199 |   \centering
200 |   \includesvg[width=1.1\linewidth]{figures/1_avg_help_25_100_traffic}
201 |   \caption{Average ``helpfulness'' of 25 and 200 most relevant reviews over time and traffic}
202 |     \label{figtime1}
203 | \end{figure}
204 | 
205 | \par
206 | 
207 | La Figura \ref{figtime2} mostra un fenomeno interessante: nonostante la quantità di recensioni cambi notevolmente nel tempo, la quantità di recensioni non verificate in rapporto al totale sembra rimanere (quasi) invariata, suggerendo un qualche tipo di moderazione.
208 | 
209 | \begin{figure}[H]
210 | 
211 |   \centering
212 |   \includesvg[width=1.1\linewidth]{figures/1_ver_unver_time_traffic}
213 |   \caption{Verified - Unverified reviews over time and traffic}
214 |   \label{figtime2}
215 | \end{figure}
216 | 
217 | 
218 | \par
219 | 
220 | Incrociando la lunghezza media delle recensioni con il loro voto, abbiamo ottenuto la Figura \ref{figtime3}. Con il passare del tempo (e l'aumentare vertiginoso del traffico) le recensioni sono generalmente più lunghe e meno generose con la valutazione che esprimono.
221 | 
222 | \par
223 | 
224 | Basandosi su alcuni di questi aspetti, Amazon ha sviluppato un modello di apprendimento automatico per assegnare un valore di rilevanza alle recensioni, in modo da poterle mettere in primo piano. In particolare, i fattori considerati sono: punteggio di ``utilità'' della recensione (voti), recensione verificata/non verificata, età della recensione.
225 | 
226 | \par 
227 | Non è noto nel dettaglio come funzioni e in che modo questi fattori vengano pesati ma è certamente importante rilevare come un approccio di questo tipo permette ad Amazon di sfruttare i contributi degli utenti e capitalizzarci, promuovendo recensioni convincenti e prodotti che riescono a produrre (legittimamente o no, altro aspetto importante) feedback così positivi e virali.
228 | 
229 | 
230 | \begin{figure}[H]
231 | 
232 |   \centering
233 |   \includesvg[width=1.1\linewidth]{figures/1_rew_len_over_time}
234 |   \caption{Review length VS overall score over time}
235 |   \label{figtime3}
236 | \end{figure}
237 | 
238 | \newpage
239 | \section{Polarizzazione delle valutazioni}
240 | 
241 | Uno degli aspetti fondamentali e poco chiari delle recensioni è quanto esse siano polarizzate attorno a un singolo voto numerico, in modo spesso estremo. È un fenomeno che si estende a ogni categoria di ogni marketplace in modo praticamente omogeneo.
242 | Nel caso di Amazon, la gran parte delle recensioni riporta una valutazione numerica massima.
243 | \par
244 | In letteratura, abbiamo trovato un recente lavoro \cite{schoenmuller2018extreme} che investiga dettagliatamente la questione, descrivendo la \textit{polarity self-selection} come fattore trainante di questo fenomeno: si tratta della tendenza dei consumatori a recensire soprattutto esperienze estreme. Si discute inoltre il fatto che le distribuzioni estreme di queste valutazioni ne riducono l'informatività, su larga scala.
245 | 
246 | \par
247 | I seguenti grafici danno un'idea di questo comportamento: la Figura \ref{disp1} confronta recensori abituali ed occasionali del sito Yelp, mostrando come gli utenti che producono più recensioni distribuiscano meglio le proprie valutazioni, senza esagerare con valutazioni massime nella maggior parte dei casi. La Figura \ref{disp2} affronta invece l'aspetto dell'incipit della recensione: quando siamo forzati a valutare un elemento, è più probabile che la nostra valutazione si distribuisca intorno al 4, mentre quando lasciamo una recensione di nostra spontanea volontà si tende a recensire ottime esperienze.
248 | 
249 | \begin{figure}[htbp]
250 |   \centering
251 |   \includegraphics[width=1.1\linewidth]{figures/ext/1_frequentInfrequentYelp.png}
252 |   \caption{Review Distribution of Frequent and Infrequent Yelp Reviewers \cite{schoenmuller2018extreme}}
253 |   \label{disp1}
254 | \end{figure}
255 | 
256 | \begin{figure}[htbp]
257 |   \centering
258 |   \includegraphics[width=1.1\linewidth]{figures/ext/1_selfVSforced.png}
259 |   \caption{Empirical Distributions for Self-Selection versus Forced Reviews \cite{schoenmuller2018extreme}}
260 |   \label{disp2}
261 | \end{figure}


--------------------------------------------------------------------------------
/figures/1_opinion_distribution.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8" standalone="no"?>
  2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  3 |   "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
  4 | <!-- Created with matplotlib (https://matplotlib.org/) -->
  5 | <svg height="720pt" version="1.1" viewBox="0 0 720 720" width="720pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
  6 |  <defs>
  7 |   <style type="text/css">
  8 | *{stroke-linecap:butt;stroke-linejoin:round;}
  9 |   </style>
 10 |  </defs>
 11 |  <g id="figure_1">
 12 |   <g id="patch_1">
 13 |    <path d="M 0 720 
 14 | L 720 720 
 15 | L 720 0 
 16 | L 0 0 
 17 | z
 18 | " style="fill:#ffffff;"/>
 19 |   </g>
 20 |   <g id="axes_1">
 21 |    <g id="patch_2">
 22 |     <path d="M 90 640.8 
 23 | L 648 640.8 
 24 | L 648 86.4 
 25 | L 90 86.4 
 26 | z
 27 | " style="fill:#eaeaf2;"/>
 28 |    </g>
 29 |    <g id="matplotlib.axis_1">
 30 |     <g id="xtick_1">
 31 |      <g id="text_1">
 32 |       <!-- positive -->
 33 |       <defs>
 34 |        <path d="M 7.140625 -20.453125 
 35 | L 17.75 -20.453125 
 36 | L 17.75 8.484375 
 37 | L 18.390625 8.484375 
 38 | C 20.3125 5 24.21875 -1.0625 34.15625 -1.0625 
 39 | C 47.375 -1.0625 56.96875 9.515625 56.96875 27.171875 
 40 | C 56.96875 44.84375 47.234375 55.25 34.0625 55.25 
 41 | C 23.9375 55.25 20.28125 49.078125 18.390625 45.703125 
 42 | L 17.515625 45.703125 
 43 | L 17.515625 54.546875 
 44 | L 7.140625 54.546875 
 45 | z
 46 | M 17.546875 27.265625 
 47 | C 17.546875 38.671875 22.515625 46.234375 31.78125 46.234375 
 48 | C 41.40625 46.234375 46.234375 38.109375 46.234375 27.265625 
 49 | C 46.234375 16.328125 41.265625 7.984375 31.78125 7.984375 
 50 | C 22.65625 7.984375 17.546875 15.796875 17.546875 27.265625 
 51 | z
 52 | " id="Inter-Medium-112"/>
 53 |        <path d="M 30.109375 -1.09375 
 54 | C 45.484375 -1.09375 55.546875 10.15625 55.546875 27.03125 
 55 | C 55.546875 44 45.484375 55.25 30.109375 55.25 
 56 | C 14.734375 55.25 4.6875 44 4.6875 27.03125 
 57 | C 4.6875 10.15625 14.734375 -1.09375 30.109375 -1.09375 
 58 | z
 59 | M 30.15625 7.8125 
 60 | C 20.09375 7.8125 15.40625 16.578125 15.40625 27.0625 
 61 | C 15.40625 37.5 20.09375 46.375 30.15625 46.375 
 62 | C 40.125 46.375 44.8125 37.5 44.8125 27.0625 
 63 | C 44.8125 16.578125 40.125 7.8125 30.15625 7.8125 
 64 | z
 65 | " id="Inter-Medium-111"/>
 66 |        <path d="M 48.015625 41.234375 
 67 | C 45.8125 49.71875 39.171875 55.25 27.375 55.25 
 68 | C 15.0625 55.25 6.328125 48.75 6.328125 39.09375 
 69 | C 6.328125 31.359375 11.015625 26.203125 21.234375 23.9375 
 70 | L 30.46875 21.90625 
 71 | C 35.71875 20.734375 38.171875 18.390625 38.171875 14.984375 
 72 | C 38.171875 10.765625 33.671875 7.453125 26.703125 7.453125 
 73 | C 20.34375 7.453125 16.265625 10.1875 14.984375 15.546875 
 74 | L 4.71875 13.984375 
 75 | C 6.5 4.328125 14.53125 -1.09375 26.78125 -1.09375 
 76 | C 39.953125 -1.09375 49.078125 5.890625 49.078125 15.765625 
 77 | C 49.078125 23.46875 44.171875 28.234375 34.15625 30.546875 
 78 | L 25.5 32.53125 
 79 | C 19.5 33.953125 16.90625 35.96875 16.9375 39.671875 
 80 | C 16.90625 43.859375 21.453125 46.84375 27.484375 46.84375 
 81 | C 34.09375 46.84375 37.140625 43.1875 38.390625 39.53125 
 82 | z
 83 | " id="Inter-Medium-115"/>
 84 |        <path d="M 7.140625 0 
 85 | L 17.75 0 
 86 | L 17.75 54.546875 
 87 | L 7.140625 54.546875 
 88 | z
 89 | M 12.5 62.96875 
 90 | C 16.15625 62.96875 19.21875 65.796875 19.21875 69.28125 
 91 | C 19.21875 72.765625 16.15625 75.640625 12.5 75.640625 
 92 | C 8.8125 75.640625 5.796875 72.765625 5.796875 69.28125 
 93 | C 5.796875 65.796875 8.8125 62.96875 12.5 62.96875 
 94 | z
 95 | " id="Inter-Medium-105"/>
 96 |        <path d="M 32.59375 54.546875 
 97 | L 21.421875 54.546875 
 98 | L 21.421875 67.609375 
 99 | L 10.796875 67.609375 
100 | L 10.796875 54.546875 
101 | L 2.8125 54.546875 
102 | L 2.8125 46.015625 
103 | L 10.796875 46.015625 
104 | L 10.796875 13.8125 
105 | C 10.765625 3.90625 18.328125 -0.890625 26.703125 -0.703125 
106 | C 30.078125 -0.671875 32.359375 -0.03125 33.59375 0.421875 
107 | L 31.671875 9.203125 
108 | C 30.96875 9.0625 29.65625 8.734375 27.953125 8.734375 
109 | C 24.5 8.734375 21.421875 9.875 21.421875 16.015625 
110 | L 21.421875 46.015625 
111 | L 32.59375 46.015625 
112 | z
113 | " id="Inter-Medium-116"/>
114 |        <path d="M 53.796875 54.546875 
115 | L 42.40625 54.546875 
116 | L 28.625 12.578125 
117 | L 28.0625 12.578125 
118 | L 14.234375 54.546875 
119 | L 2.84375 54.546875 
120 | L 22.65625 0 
121 | L 34.015625 0 
122 | z
123 | " id="Inter-Medium-118"/>
124 |        <path d="M 30.640625 -1.09375 
125 | C 42.546875 -1.09375 50.953125 4.765625 53.375 13.640625 
126 | L 43.328125 15.453125 
127 | C 41.40625 10.296875 36.796875 7.671875 30.75 7.671875 
128 | C 21.65625 7.671875 15.5625 13.5625 15.265625 24.078125 
129 | L 54.046875 24.078125 
130 | L 54.046875 27.84375 
131 | C 54.046875 47.546875 42.265625 55.25 29.90625 55.25 
132 | C 14.703125 55.25 4.6875 43.671875 4.6875 26.921875 
133 | C 4.6875 9.984375 14.5625 -1.09375 30.640625 -1.09375 
134 | z
135 | M 15.3125 32.03125 
136 | C 15.734375 39.765625 21.34375 46.484375 29.96875 46.484375 
137 | C 38.21875 46.484375 43.609375 40.375 43.640625 32.03125 
138 | z
139 | " id="Inter-Medium-101"/>
140 |       </defs>
141 |       <g style="fill:#262626;" transform="translate(158.437109 657.633281)scale(0.13 -0.13)">
142 |        <use xlink:href="#Inter-Medium-112"/>
143 |        <use x="61.718765" xlink:href="#Inter-Medium-111"/>
144 |        <use x="121.946045" xlink:href="#Inter-Medium-115"/>
145 |        <use x="175.5327" xlink:href="#Inter-Medium-105"/>
146 |        <use x="200.426163" xlink:href="#Inter-Medium-116"/>
147 |        <use x="237.606567" xlink:href="#Inter-Medium-105"/>
148 |        <use x="262.500031" xlink:href="#Inter-Medium-118"/>
149 |        <use x="319.140671" xlink:href="#Inter-Medium-101"/>
150 |       </g>
151 |      </g>
152 |     </g>
153 |     <g id="xtick_2">
154 |      <g id="text_2">
155 |       <!-- neutral -->
156 |       <defs>
157 |        <path d="M 17.75 32.390625 
158 | C 17.75 41.09375 23.078125 46.0625 30.46875 46.0625 
159 | C 37.671875 46.0625 42.046875 41.328125 42.046875 33.421875 
160 | L 42.046875 0 
161 | L 52.671875 0 
162 | L 52.671875 34.6875 
163 | C 52.671875 48.1875 45.25 55.25 34.09375 55.25 
164 | C 25.890625 55.25 20.53125 51.453125 18 45.671875 
165 | L 17.328125 45.671875 
166 | L 17.328125 54.546875 
167 | L 7.140625 54.546875 
168 | L 7.140625 0 
169 | L 17.75 0 
170 | z
171 | " id="Inter-Medium-110"/>
172 |        <path d="M 41.6875 22.625 
173 | C 41.734375 13.359375 34.84375 8.953125 28.875 8.953125 
174 | C 22.296875 8.953125 17.75 13.703125 17.75 21.125 
175 | L 17.75 54.546875 
176 | L 7.140625 54.546875 
177 | L 7.140625 19.84375 
178 | C 7.140625 6.328125 14.5625 -0.703125 25.03125 -0.703125 
179 | C 33.234375 -0.703125 38.8125 3.625 41.34375 9.453125 
180 | L 41.90625 9.453125 
181 | L 41.90625 0 
182 | L 52.34375 0 
183 | L 52.34375 54.546875 
184 | L 41.6875 54.546875 
185 | z
186 | " id="Inter-Medium-117"/>
187 |        <path d="M 7.140625 0 
188 | L 17.75 0 
189 | L 17.75 33.3125 
190 | C 17.75 40.40625 23.265625 45.5625 30.609375 45.5625 
191 | C 32.671875 45.5625 34.265625 45.140625 35.546875 44.5 
192 | L 38.953125 53.375 
193 | C 37.140625 54.578125 34.796875 55.359375 31.8125 55.359375 
194 | C 25.359375 55.359375 19.953125 51.734375 17.96875 45.875 
195 | L 17.40625 45.875 
196 | L 17.40625 54.546875 
197 | L 7.140625 54.546875 
198 | z
199 | " id="Inter-Medium-114"/>
200 |        <path d="M 23.015625 -1.203125 
201 | C 32.03125 -1.203125 37.109375 3.375 39.140625 7.453125 
202 | L 39.5625 7.453125 
203 | L 39.5625 0 
204 | L 49.9375 0 
205 | L 49.9375 36.21875 
206 | C 49.9375 52.09375 37.4375 55.25 28.765625 55.25 
207 | C 18.890625 55.25 9.796875 51.28125 6.25 41.328125 
208 | L 16.234375 39.0625 
209 | C 17.796875 42.9375 21.765625 46.65625 28.90625 46.65625 
210 | C 35.765625 46.65625 39.28125 43.078125 39.28125 36.890625 
211 | L 39.28125 36.640625 
212 | C 39.28125 32.78125 35.296875 32.84375 25.5 31.71875 
213 | C 15.171875 30.5 4.578125 27.8125 4.578125 15.40625 
214 | C 4.578125 4.6875 12.640625 -1.203125 23.015625 -1.203125 
215 | z
216 | M 25.328125 7.3125 
217 | C 19.3125 7.3125 14.984375 10.015625 14.984375 15.265625 
218 | C 14.984375 20.953125 20.03125 22.96875 26.171875 23.796875 
219 | C 29.625 24.25 37.78125 25.171875 39.3125 26.703125 
220 | L 39.3125 19.671875 
221 | C 39.3125 13.203125 34.15625 7.3125 25.328125 7.3125 
222 | z
223 | " id="Inter-Medium-97"/>
224 |        <path d="M 17.75 72.734375 
225 | L 7.140625 72.734375 
226 | L 7.140625 0 
227 | L 17.75 0 
228 | z
229 | " id="Inter-Medium-108"/>
230 |       </defs>
231 |       <g style="fill:#262626;" transform="translate(347.110234 657.255469)scale(0.13 -0.13)">
232 |        <use xlink:href="#Inter-Medium-110"/>
233 |        <use x="59.730118" xlink:href="#Inter-Medium-101"/>
234 |        <use x="118.465927" xlink:href="#Inter-Medium-117"/>
235 |        <use x="177.911957" xlink:href="#Inter-Medium-116"/>
236 |        <use x="215.092361" xlink:href="#Inter-Medium-114"/>
237 |        <use x="254.936111" xlink:href="#Inter-Medium-97"/>
238 |        <use x="311.86084" xlink:href="#Inter-Medium-108"/>
239 |       </g>
240 |      </g>
241 |     </g>
242 |     <g id="xtick_3">
243 |      <g id="text_3">
244 |       <!-- negative -->
245 |       <defs>
246 |        <path d="M 30.015625 -21.484375 
247 | C 43.890625 -21.484375 54.625 -15.125 54.625 -1.203125 
248 | L 54.625 54.546875 
249 | L 44.21875 54.546875 
250 | L 44.21875 45.703125 
251 | L 43.4375 45.703125 
252 | C 41.546875 49.078125 37.78125 55.25 27.625 55.25 
253 | C 14.453125 55.25 4.765625 44.84375 4.765625 27.484375 
254 | C 4.765625 10.078125 14.671875 0.8125 27.5625 0.8125 
255 | C 37.578125 0.8125 41.4375 6.46875 43.359375 9.9375 
256 | L 44.03125 9.9375 
257 | L 44.03125 -0.78125 
258 | C 44.03125 -9.203125 38.171875 -12.921875 30.109375 -12.921875 
259 | C 21.265625 -12.921875 17.828125 -8.484375 15.953125 -5.46875 
260 | L 6.8125 -9.234375 
261 | C 9.703125 -15.796875 16.96875 -21.484375 30.015625 -21.484375 
262 | z
263 | M 29.90625 9.625 
264 | C 20.421875 9.625 15.484375 16.90625 15.484375 27.625 
265 | C 15.484375 38.109375 20.3125 46.234375 29.90625 46.234375 
266 | C 39.171875 46.234375 44.140625 38.671875 44.140625 27.625 
267 | C 44.140625 16.375 39.0625 9.625 29.90625 9.625 
268 | z
269 | " id="Inter-Medium-103"/>
270 |       </defs>
271 |       <g style="fill:#262626;" transform="translate(528.053437 657.633281)scale(0.13 -0.13)">
272 |        <use xlink:href="#Inter-Medium-110"/>
273 |        <use x="59.730118" xlink:href="#Inter-Medium-101"/>
274 |        <use x="118.465927" xlink:href="#Inter-Medium-103"/>
275 |        <use x="180.184692" xlink:href="#Inter-Medium-97"/>
276 |        <use x="237.109421" xlink:href="#Inter-Medium-116"/>
277 |        <use x="274.289825" xlink:href="#Inter-Medium-105"/>
278 |        <use x="299.183289" xlink:href="#Inter-Medium-118"/>
279 |        <use x="355.823929" xlink:href="#Inter-Medium-101"/>
280 |       </g>
281 |      </g>
282 |     </g>
283 |     <g id="text_4">
284 |      <!-- Opinion -->
285 |      <defs>
286 |       <path d="M 71.171875 36.359375 
287 | C 71.171875 59.625 57.25 73.71875 38.421875 73.71875 
288 | C 19.53125 73.71875 5.640625 59.625 5.640625 36.359375 
289 | C 5.640625 13.140625 19.53125 -1 38.421875 -1 
290 | C 57.25 -1 71.171875 13.109375 71.171875 36.359375 
291 | z
292 | M 60.296875 36.359375 
293 | C 60.296875 18.640625 50.921875 9.015625 38.421875 9.015625 
294 | C 25.890625 9.015625 16.546875 18.640625 16.546875 36.359375 
295 | C 16.546875 54.078125 25.890625 63.703125 38.421875 63.703125 
296 | C 50.921875 63.703125 60.296875 54.078125 60.296875 36.359375 
297 | z
298 | " id="Inter-Medium-79"/>
299 |      </defs>
300 |      <g style="fill:#262626;" transform="translate(341.398828 675.772344)scale(0.15 -0.15)">
301 |       <use xlink:href="#Inter-Medium-79"/>
302 |       <use x="76.811096" xlink:href="#Inter-Medium-112"/>
303 |       <use x="138.529861" xlink:href="#Inter-Medium-105"/>
304 |       <use x="163.423325" xlink:href="#Inter-Medium-110"/>
305 |       <use x="223.153442" xlink:href="#Inter-Medium-105"/>
306 |       <use x="248.046906" xlink:href="#Inter-Medium-111"/>
307 |       <use x="308.274185" xlink:href="#Inter-Medium-110"/>
308 |      </g>
309 |     </g>
310 |    </g>
311 |    <g id="matplotlib.axis_2">
312 |     <g id="ytick_1">
313 |      <g id="line2d_1">
314 |       <path clip-path="url(#p61b4226e6a)" d="M 90 640.8 
315 | L 648 640.8 
316 | " style="fill:none;stroke:#ffffff;stroke-linecap:round;stroke-width:0.8;"/>
317 |      </g>
318 |      <g id="text_5">
319 |       <!-- 0 -->
320 |       <defs>
321 |        <path d="M 32.3125 -1.203125 
322 | C 49.109375 -1.203125 58.953125 12.5 58.953125 36.328125 
323 | C 58.953125 59.984375 48.96875 73.71875 32.3125 73.71875 
324 | C 15.625 73.71875 5.6875 60.015625 5.640625 36.328125 
325 | C 5.640625 12.53125 15.484375 -1.171875 32.3125 -1.203125 
326 | z
327 | M 32.3125 8.28125 
328 | C 22.484375 8.28125 16.546875 18.140625 16.546875 36.328125 
329 | C 16.578125 54.4375 22.515625 64.421875 32.3125 64.421875 
330 | C 42.078125 64.421875 48.046875 54.4375 48.046875 36.328125 
331 | C 48.046875 18.140625 42.125 8.28125 32.3125 8.28125 
332 | z
333 | " id="Inter-Medium-48"/>
334 |       </defs>
335 |       <g style="fill:#262626;" transform="translate(74.602812 645.527734)scale(0.13 -0.13)">
336 |        <use xlink:href="#Inter-Medium-48"/>
337 |       </g>
338 |      </g>
339 |     </g>
340 |     <g id="ytick_2">
341 |      <g id="line2d_2">
342 |       <path clip-path="url(#p61b4226e6a)" d="M 90 585.36 
343 | L 648 585.36 
344 | " style="fill:none;stroke:#ffffff;stroke-linecap:round;stroke-width:0.8;"/>
345 |      </g>
346 |      <g id="text_6">
347 |       <!-- 100K -->
348 |       <defs>
349 |        <path d="M 32.5625 72.734375 
350 | L 22.34375 72.734375 
351 | C 21.953125 70.734375 16.40625 61.546875 5.796875 61.546875 
352 | L 5.796875 52.515625 
353 | C 14.03125 52.515625 19.75 55.78125 20.953125 57.921875 
354 | L 21.5625 57.921875 
355 | L 21.5625 0 
356 | L 32.5625 0 
357 | z
358 | " id="Inter-Medium-49"/>
359 |        <path d="M 7.984375 0 
360 | L 18.96875 0 
361 | L 18.96875 22.625 
362 | L 27.5625 32.5 
363 | L 51.0625 0 
364 | L 64.28125 0 
365 | L 34.84375 39.984375 
366 | L 64.171875 72.734375 
367 | L 50.359375 72.734375 
368 | L 19.859375 38 
369 | L 18.96875 38 
370 | L 18.96875 72.734375 
371 | L 7.984375 72.734375 
372 | z
373 | " id="Inter-Medium-75"/>
374 |       </defs>
375 |       <g style="fill:#262626;" transform="translate(51.68625 590.087734)scale(0.13 -0.13)">
376 |        <use xlink:href="#Inter-Medium-49"/>
377 |        <use x="45.241486" xlink:href="#Inter-Medium-48"/>
378 |        <use x="109.83667" xlink:href="#Inter-Medium-48"/>
379 |        <use x="174.431854" xlink:href="#Inter-Medium-75"/>
380 |       </g>
381 |      </g>
382 |     </g>
383 |     <g id="ytick_3">
384 |      <g id="line2d_3">
385 |       <path clip-path="url(#p61b4226e6a)" d="M 90 363.6 
386 | L 648 363.6 
387 | " style="fill:none;stroke:#ffffff;stroke-linecap:round;stroke-width:0.8;"/>
388 |      </g>
389 |      <g id="text_7">
390 |       <!-- 500K -->
391 |       <defs>
392 |        <path d="M 31 -1 
393 | C 45.640625 -1 56.078125 9.34375 56.03125 23.515625 
394 | C 56.078125 37.578125 46.265625 47.796875 33.03125 47.796875 
395 | C 27.625 47.796875 22.625 45.734375 19.78125 42.96875 
396 | L 19.359375 42.96875 
397 | L 21.65625 63.3125 
398 | L 52.515625 63.3125 
399 | L 52.515625 72.734375 
400 | L 12.53125 72.734375 
401 | L 8.453125 36.078125 
402 | L 18.5 34.59375 
403 | C 21.234375 37.03125 26.140625 38.703125 30.546875 38.703125 
404 | C 39.171875 38.640625 45.421875 32.171875 45.421875 23.296875 
405 | C 45.421875 14.5625 39.34375 8.234375 31 8.234375 
406 | C 23.96875 8.234375 18.390625 12.71875 17.828125 18.96875 
407 | L 7.171875 18.96875 
408 | C 7.59375 7.390625 17.65625 -1 31 -1 
409 | z
410 | " id="Inter-Medium-53"/>
411 |       </defs>
412 |       <g style="fill:#262626;" transform="translate(49.50875 368.327734)scale(0.13 -0.13)">
413 |        <use xlink:href="#Inter-Medium-53"/>
414 |        <use x="62.002853" xlink:href="#Inter-Medium-48"/>
415 |        <use x="126.598038" xlink:href="#Inter-Medium-48"/>
416 |        <use x="191.193222" xlink:href="#Inter-Medium-75"/>
417 |       </g>
418 |      </g>
419 |     </g>
420 |     <g id="ytick_4">
421 |      <g id="line2d_4">
422 |       <path clip-path="url(#p61b4226e6a)" d="M 90 86.4 
423 | L 648 86.4 
424 | " style="fill:none;stroke:#ffffff;stroke-linecap:round;stroke-width:0.8;"/>
425 |      </g>
426 |      <g id="text_8">
427 |       <!-- 1M -->
428 |       <defs>
429 |        <path d="M 7.984375 72.734375 
430 | L 7.984375 0 
431 | L 18.4375 0 
432 | L 18.4375 52.65625 
433 | L 19.109375 52.65625 
434 | L 40.5625 0.109375 
435 | L 49.21875 0.109375 
436 | L 70.671875 52.625 
437 | L 71.34375 52.625 
438 | L 71.34375 0 
439 | L 81.78125 0 
440 | L 81.78125 72.734375 
441 | L 68.46875 72.734375 
442 | L 45.3125 16.1875 
443 | L 44.46875 16.1875 
444 | L 21.3125 72.734375 
445 | z
446 | " id="Inter-Medium-77"/>
447 |       </defs>
448 |       <g style="fill:#262626;" transform="translate(65.212344 91.127734)scale(0.13 -0.13)">
449 |        <use xlink:href="#Inter-Medium-49"/>
450 |        <use x="45.241486" xlink:href="#Inter-Medium-77"/>
451 |       </g>
452 |      </g>
453 |     </g>
454 |     <g id="text_9">
455 |      <!-- Review count -->
456 |      <defs>
457 |       <path d="M 7.984375 0 
458 | L 18.96875 0 
459 | L 18.96875 27.515625 
460 | L 33.875 27.515625 
461 | C 34.125 27.515625 34.34375 27.515625 34.59375 27.515625 
462 | L 49.359375 0 
463 | L 61.859375 0 
464 | L 45.734375 29.4375 
465 | C 54.65625 32.734375 59.09375 40.15625 59.09375 49.921875 
466 | C 59.09375 63.3125 50.78125 72.734375 33.921875 72.734375 
467 | L 7.984375 72.734375 
468 | z
469 | M 18.96875 36.96875 
470 | L 18.96875 63.3125 
471 | L 32.75 63.3125 
472 | C 43.578125 63.3125 48.046875 58.171875 48.046875 49.921875 
473 | C 48.046875 41.71875 43.578125 36.96875 32.890625 36.96875 
474 | z
475 | " id="Inter-Medium-82"/>
476 |       <path d="M 18.96875 0 
477 | L 29.765625 0 
478 | L 40.84375 39.375 
479 | L 41.65625 39.375 
480 | L 52.734375 0 
481 | L 63.5625 0 
482 | L 79.578125 54.546875 
483 | L 68.609375 54.546875 
484 | L 57.984375 14.671875 
485 | L 57.453125 14.671875 
486 | L 46.8125 54.546875 
487 | L 35.828125 54.546875 
488 | L 25.109375 14.484375 
489 | L 24.578125 14.484375 
490 | L 13.890625 54.546875 
491 | L 2.90625 54.546875 
492 | z
493 | " id="Inter-Medium-119"/>
494 |       <path id="Inter-Medium-32"/>
495 |       <path d="M 30.109375 -1.09375 
496 | C 43 -1.09375 51.34375 6.640625 52.515625 17.265625 
497 | L 42.1875 17.265625 
498 | C 40.84375 11.359375 36.265625 7.890625 30.1875 7.890625 
499 | C 21.203125 7.890625 15.40625 15.375 15.40625 27.265625 
500 | C 15.40625 38.953125 21.3125 46.3125 30.1875 46.3125 
501 | C 36.9375 46.3125 41.046875 42.046875 42.1875 36.9375 
502 | L 52.515625 36.9375 
503 | C 51.390625 47.9375 42.40625 55.25 30.015625 55.25 
504 | C 14.625 55.25 4.6875 43.671875 4.6875 27.03125 
505 | C 4.6875 10.578125 14.28125 -1.09375 30.109375 -1.09375 
506 | z
507 | " id="Inter-Medium-99"/>
508 |      </defs>
509 |      <g style="fill:#262626;" transform="translate(42.440781 412.040625)rotate(-90)scale(0.15 -0.15)">
510 |       <use xlink:href="#Inter-Medium-82"/>
511 |       <use x="64.488647" xlink:href="#Inter-Medium-101"/>
512 |       <use x="123.224457" xlink:href="#Inter-Medium-118"/>
513 |       <use x="179.865097" xlink:href="#Inter-Medium-105"/>
514 |       <use x="204.75856" xlink:href="#Inter-Medium-101"/>
515 |       <use x="263.49437" xlink:href="#Inter-Medium-119"/>
516 |       <use x="345.987289" xlink:href="#Inter-Medium-32"/>
517 |       <use x="372.478775" xlink:href="#Inter-Medium-99"/>
518 |       <use x="429.261459" xlink:href="#Inter-Medium-111"/>
519 |       <use x="489.488739" xlink:href="#Inter-Medium-117"/>
520 |       <use x="548.934769" xlink:href="#Inter-Medium-110"/>
521 |       <use x="608.664886" xlink:href="#Inter-Medium-116"/>
522 |      </g>
523 |     </g>
524 |    </g>
525 |    <g id="patch_3">
526 |     <path clip-path="url(#p61b4226e6a)" d="M 108.6 640.8 
527 | L 257.4 640.8 
528 | L 257.4 146.957666 
529 | L 108.6 146.957666 
530 | z
531 | " style="fill:#3274a1;stroke:#ffffff;stroke-linejoin:miter;"/>
532 |    </g>
533 |    <g id="patch_4">
534 |     <path clip-path="url(#p61b4226e6a)" d="M 294.6 640.8 
535 | L 443.4 640.8 
536 | L 443.4 586.350713 
537 | L 294.6 586.350713 
538 | z
539 | " style="fill:#e1812c;stroke:#ffffff;stroke-linejoin:miter;"/>
540 |    </g>
541 |    <g id="patch_5">
542 |     <path clip-path="url(#p61b4226e6a)" d="M 480.6 640.8 
543 | L 629.4 640.8 
544 | L 629.4 563.920243 
545 | L 480.6 563.920243 
546 | z
547 | " style="fill:#3a923a;stroke:#ffffff;stroke-linejoin:miter;"/>
548 |    </g>
549 |    <g id="patch_6">
550 |     <path d="M 90 640.8 
551 | L 90 86.4 
552 | " style="fill:none;stroke:#ffffff;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
553 |    </g>
554 |    <g id="patch_7">
555 |     <path d="M 648 640.8 
556 | L 648 86.4 
557 | " style="fill:none;stroke:#ffffff;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
558 |    </g>
559 |    <g id="patch_8">
560 |     <path d="M 90 640.8 
561 | L 648 640.8 
562 | " style="fill:none;stroke:#ffffff;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
563 |    </g>
564 |    <g id="patch_9">
565 |     <path d="M 90 86.4 
566 | L 648 86.4 
567 | " style="fill:none;stroke:#ffffff;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
568 |    </g>
569 |   </g>
570 |  </g>
571 |  <defs>
572 |   <clipPath id="p61b4226e6a">
573 |    <rect height="554.4" width="558" x="90" y="86.4"/>
574 |   </clipPath>
575 |  </defs>
576 | </svg>
577 | 


--------------------------------------------------------------------------------