├── .gitignore ├── Pipfile ├── lib ├── helpers.py ├── plotting.py └── template.html ├── examples └── example.csv ├── readme.MD └── sherlock.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | out.html 4 | Pipfile.lock 5 | .DS_Store 6 | .idea 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | pmprophet = "==0.2.8" 10 | arviz = "*" 11 | pymc3 = "*" 12 | jinja2 = "*" 13 | matplotlib = "*" 14 | 15 | [requires] 16 | python_version = "3.7" 17 | -------------------------------------------------------------------------------- /lib/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import io 3 | import base64 4 | 5 | ERRORS = { 6 | "no_input": "No input data file was provided", 7 | "not_found": "I can't find the input file you provided me, try removing any space in it's name", 8 | "not_readable": "The input file is not readable (make sure it's a valid csv file)", 9 | "missing_columns": "The input data file is missing one or more mandatory columns: {}", 10 | "date_column_not_readable": "The date column is not readable, try using the format dd/mm/yyyy (e.g. 30/12/1990)", 11 | "timespan_too_short": "Provide at least 7 days of data", 12 | "dls_less_than_0": "Downloads are below zero for certain dates. Check your data or allow outlier removal with -r", 13 | "conversion_less_than_0": "The conversion is below zero for certain dates. 
Check your data or allow outlier removal with -r", 14 | } 15 | 16 | WARNINGS = { 17 | 'update_not_understood': 'An update was marked as "{}" and was ignored, valid updates are "textual" and "visual"', 18 | "timespan_too_short": 'For best results use at least one month of data', 19 | } 20 | 21 | INFO = { 22 | 'additional_regressor': 'Using column {} as an additional regressor' 23 | } 24 | 25 | REQUIRED_COLUMNS = [ 26 | 'date', 27 | 'update', 28 | 'search_downloads', 29 | 'search_impressions' 30 | ] 31 | 32 | OPTIONAL_COLUMNS = [ 33 | 'asa_impressions', 34 | ] 35 | 36 | 37 | def figure_to_base64(fig): 38 | io_stream = io.BytesIO() 39 | fig.savefig(io_stream, format='png') 40 | io_stream.seek(0) 41 | return (b'data:image/png;base64, ' + base64.b64encode(io_stream.read())).decode() 42 | 43 | 44 | def safe_mean(x): 45 | try: 46 | return np.mean(x) 47 | except TypeError: 48 | x = x.dropna() 49 | if x.empty: 50 | return None 51 | else: 52 | return x[0] 53 | -------------------------------------------------------------------------------- /examples/example.csv: -------------------------------------------------------------------------------- 1 | date,update,search_downloads,search_impressions,asa_impressions,asa,google 2 | 01/11/2018,,725.0,8786,3105,334,94 3 | 02/11/2018,,711.0,8374,3890,348,74 4 | 03/11/2018,,790.0,9515,3900,400,71 5 | 04/11/2018,,902.0,10371,4600,517,107 6 | 05/11/2018,,721.0,8681,3409,369,108 7 | 06/11/2018,,733.0,8243,3642,391,95 8 | 07/11/2018,,667.0,7793,2524,322,97 9 | 08/11/2018,,775.0,8892,3083,391,67 10 | 09/11/2018,,687.0,9052,3672,373,108 11 | 10/11/2018,,745.0,8692,3473,433,110 12 | 11/11/2018,,762.0,9351,4142,390,92 13 | 12/11/2018,,776.0,9371,3805,395,91 14 | 13/11/2018,,725.0,8364,3226,330,204 15 | 14/11/2018,,696.0,7730,2417,297,271 16 | 15/11/2018,,735.0,8900,3719,321,252 17 | 16/11/2018,,940.0,11172,4787,516,223 18 | 17/11/2018,,1067.0,14395,8754,686,188 19 | 18/11/2018,,1133.0,14722,9353,770,153 20 | 19/11/2018,,929.0,13941,7075,644,129 
21 | 20/11/2018,,855.0,12840,5821,519,130 22 | 21/11/2018,,916.0,12024,5344,490,157 23 | 22/11/2018,,876.0,11490,5656,461,159 24 | 23/11/2018,,970.0,11383,6281,528,123 25 | 24/11/2018,,912.0,14066,7242,561,113 26 | 25/11/2018,,818.0,11886,6055,524,118 27 | 26/11/2018,,823.0,10661,7744,630,98 28 | 27/11/2018,,670.0,9266,4024,323,133 29 | 28/11/2018,,854.0,11844,5596,470,102 30 | 29/11/2018,,874.0,11668,5727,523,118 31 | 30/11/2018,,730.0,12789,6472,381,112 32 | 01/12/2018,,1109.0,14686,8085,629,117 33 | 02/12/2018,,1088.0,12858,6831,622,169 34 | 03/12/2018,visual,926.0,11366,5811,569,115 35 | 04/12/2018,,825.0,9462,4689,489,105 36 | 05/12/2018,,919.0,10024,4804,522,102 37 | 06/12/2018,,905.0,8799,3773,347,110 38 | 07/12/2018,,746.0,8452,3723,309,134 39 | 08/12/2018,,871.0,10105,4313,484,154 40 | 09/12/2018,,1065.0,11098,6048,680,128 41 | 10/12/2018,,866.0,9660,4936,485,112 42 | 11/12/2018,,930.0,11273,4725,560,162 43 | 12/12/2018,,1097.0,11491,5563,797,190 44 | 13/12/2018,,1170.0,13394,7253,778,156 45 | 14/12/2018,,1364.0,14422,10244,954,145 46 | 15/12/2018,,1451.0,18068,12438,1087,179 47 | 16/12/2018,,1568.0,19479,14878,1151,201 48 | 17/12/2018,,1315.0,18427,12337,985,142 49 | 18/12/2018,,1412.0,13688,9544,1001,111 50 | 19/12/2018,,1371.0,17575,10999,1058,149 51 | 20/12/2018,,1506.0,15090,9029,1095,194 52 | 21/12/2018,,1297.0,13762,8338,937,171 53 | 22/12/2018,all,1111.0,12600,6691,734,158 54 | 23/12/2018,,829.0,10262,4932,472,148 55 | 24/12/2018,,898.0,11565,4308,469,115 56 | 25/12/2018,,738.0,10827,4558,354,89 57 | 26/12/2018,,949.0,11061,5279,453,136 58 | 27/12/2018,,868.0,11166,5101,568,122 59 | 28/12/2018,,958.0,11272,5134,545,128 60 | 29/12/2018,,934.0,11317,4647,460,145 61 | 30/12/2018,,1044.0,12539,5156,601,178 62 | 31/12/2018,,1366.0,15050,7941,873,190 -------------------------------------------------------------------------------- /readme.MD: -------------------------------------------------------------------------------- 1 | # Sherlock - The ASO 
inspector 2 | 3 | Sherlock is a program I wrote as a companion to my presentation at the [ASO Conference in NYC](https://asoconference.com/newyork/) in 2019. 4 | 5 | It's based on top of a library I wrote called [pm-prophet](https://github.com/luke14free/pm-prophet) 6 | 7 | It can produce curated reports about your app and its traffic, including: 8 | * Updates impact (both visual and textual updates) 9 | * Seasonality 10 | * Uplift 11 | 12 | **Sample output**: is demonstrated in the file `examples/report.html` (which is a fit of sherlock on some fake data contained in `examples/example.csv`). 13 | 14 | ### Installation 15 | 16 | 1) [Install python 3](https://www.python.org/downloads/) 17 | 2) [Install pipenv](https://docs.pipenv.org/en/latest/install/) 18 | 3) Open your terminal and run `pipenv install` from the folder in which you have cloned this repository 19 | 20 | Et Voilà. You should be all set. 21 | 22 | ### Preparing your data 23 | 24 | Sherlock takes as input a csv file of different time-series. The naming convention is strict and it's case-sensitive. 25 | 26 | **Required columns:** 27 | 28 | 1) `date` the date of the events (expressed in *dd/mm/yyyy* format, e.g. 30/12/2019). 29 | 2) `update` a column with value `textual`, `visual` or `all` (i.e. visual and textual) to tell sherlock what type of update was done on that particular date (in lowercase!). 30 | 3) `search_downloads` downloads data coming from iTunes connect filtered by App Store Search source 31 | 4) `search_impressions` impressions data coming from iTunes connect filtered by App Store Search source 32 | 33 | **Optional columns (if you do Apple Search Ads):** 34 | 35 | 1) `asa_impressions` impressions coming from Apple Search Ads 36 | 2) `asa` downloads coming from Apple Search Ads 37 | 38 | **Extra columns:** Any other time series you like (e.g. Facebook, Google, Snapchat..) for which to compute uplift. 
39 | 40 | ### Usage 41 | 42 | Usage is very simple, from the terminal run: 43 | 44 | `pipenv shell` 45 | 46 | to activate the virtual environment and: 47 | 48 | `python sherlock.py -i example.csv` 49 | 50 | To run the script. There are some options available: 51 | 52 | ``` 53 | Options: 54 | -h, --help show this help message and exit 55 | -a APP_NAME, --app-name=APP_NAME 56 | Specify the app name if you want a more personalized 57 | report 58 | -i FILE, --input-file=FILE 59 | Input CSV file 60 | -o FILE, --output-file=FILE 61 | Output report file (in html format) 62 | -s SAMPLER, --sampler=SAMPLER 63 | Sampler to use ("nuts" is slower but more precise, 64 | default "metropolis") 65 | -n, --no-asa Do not use ASA as an additional regressor (better 66 | seasonality fits) 67 | -w, --weekly Run the analysis on a weekly resampling 68 | -r SIGMA, --remove-outliers-sigma=SIGMA 69 | Remove outliers at more than X sigma from the mean 70 | (suggested values range between 1.5-3.5). Default 71 | value is: 0 that means that Sherlock will not remove 72 | outliers 73 | -l ALPHA, --significance-level=ALPHA 74 | The significance level for the analysis (default is 75 | 0.05) 76 | -k SEASONALITY_SCALE, --seasonality-scale=SEASONALITY_SCALE 77 | The scale of the seasonality, if it fits poorly 78 | because you have great variance due to seasonality 79 | increase this. By default this is automatically 80 | computed 81 | ``` 82 | 83 | ### Output 84 | 85 | Open the `report.html` file generated by Sherlock. 
86 | 87 | ### License 88 | 89 | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode) 90 | -------------------------------------------------------------------------------- /lib/plotting.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pylab as plt 3 | import numpy as np 4 | 5 | 6 | def plot_nowcast(model, updates): 7 | fig = plt.figure(figsize=(20, 10)) 8 | y = model.trace['y_hat_%s' % model.name] 9 | ddf = pd.DataFrame( 10 | [ 11 | np.percentile(y, 50, axis=0), 12 | np.max(y, axis=0), 13 | np.min(y, axis=0), 14 | ] 15 | ).T 16 | ddf["ds"] = model.data["ds"] 17 | ddf.columns = ["y_hat", "y_low", "y_high", "ds"] 18 | ddf["orig_y"] = model.data["y"] 19 | ddf.plot("ds", "y_hat", ax=plt.gca()) 20 | plt.fill_between( 21 | ddf["ds"].values, 22 | ddf["y_low"].values.astype(float), 23 | ddf["y_high"].values.astype(float), 24 | alpha=0.3, 25 | ) 26 | ddf.plot("ds", "orig_y", style="k.", ax=plt.gca(), alpha=0.3) 27 | for update in updates: 28 | plt.axvline( 29 | update, color="C3", lw=1, ls="dotted" 30 | ) 31 | plt.grid(axis="y") 32 | return fig 33 | 34 | 35 | def plot_predict(prediction, original, updates): 36 | fig = plt.figure(figsize=(20, 10)) 37 | prediction.plot("ds", "y_hat", ax=plt.gca()) 38 | prediction["orig_y"] = original["y"] 39 | plt.fill_between( 40 | prediction["ds"].values, 41 | prediction["y_low"].values.astype(float), 42 | prediction["y_high"].values.astype(float), 43 | alpha=0.3, 44 | ) 45 | 46 | prediction.plot("ds", "orig_y", style="k.", ax=plt.gca(), alpha=0.2) 47 | for update in updates: 48 | plt.axvline( 49 | update, color="C3", lw=1, ls="dotted" 50 | ) 51 | plt.grid(axis="y") 52 | return fig 53 | 54 | 55 | def plot_seasonality(self, alpha: float, plot_kwargs: bool): 56 | periods = list(set([float(i.split("_")[1]) for i in self.seasonality])) 57 | 58 | additive_ts, multiplicative_ts = self._fit_seasonality() 59 | 60 | all_seasonalities = [("additive", 
additive_ts)] 61 | if len(self.multiplicative_data): 62 | all_seasonalities.append(("multiplicative", multiplicative_ts)) 63 | all_figures = {} 64 | 65 | for sn, ts in all_seasonalities: 66 | if (sn == "multiplicative" and np.sum(ts) == 1) or ( 67 | sn == "additive" and np.sum(ts) == 0 68 | ): 69 | continue 70 | ddf = pd.DataFrame( 71 | np.vstack( 72 | [ 73 | np.percentile(ts[:, :, self.skip_first:], 50, axis=-1), 74 | np.percentile( 75 | ts[:, :, self.skip_first:], alpha / 2 * 100, axis=-1 76 | ), 77 | np.percentile( 78 | ts[:, :, self.skip_first:], (1 - alpha / 2) * 100, axis=-1 79 | ), 80 | ] 81 | ).T, 82 | columns=[ 83 | "%s_%s" % (p, l) 84 | for l in ["mid", "low", "high"] 85 | for p in periods[::-1] 86 | ], 87 | ) 88 | ddf.loc[:, "ds"] = self.data["ds"] 89 | 90 | for period in periods: 91 | if int(period) == 0: 92 | step = int( 93 | self.data["ds"].diff().mean().total_seconds() // float(period) 94 | ) 95 | else: 96 | step = int(period) 97 | graph = ddf.head(step) 98 | if period == 7: 99 | ddf.loc[:, "dow"] = [i for i in ddf["ds"].dt.weekday] 100 | graph = ( 101 | ddf[ 102 | [ 103 | "dow", 104 | "%s_low" % period, 105 | "%s_mid" % period, 106 | "%s_high" % period, 107 | ] 108 | ] 109 | .groupby("dow") 110 | .mean() 111 | .sort_values("dow") 112 | ) 113 | graph.loc[:, "ds"] = [ 114 | ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][i] 115 | for i in graph.index 116 | ] 117 | graph = graph.sort_index() 118 | fig = plt.figure(**plot_kwargs) 119 | all_figures[period] = fig 120 | graph.plot( 121 | y="%s_mid" % period, x="ds", color="C0", legend=False, ax=plt.gca() 122 | ) 123 | plt.grid() 124 | 125 | if period == 7: 126 | plt.xticks(range(7), graph["ds"].values) 127 | plt.fill_between( 128 | np.arange(0, 7), 129 | graph["%s_low" % period].values.astype(float), 130 | graph["%s_high" % period].values.astype(float), 131 | alpha=0.3, 132 | ) 133 | else: 134 | plt.fill_between( 135 | graph["ds"].values, 136 | graph["%s_low" % period].values.astype(float), 137 | 
graph["%s_high" % period].values.astype(float), 138 | alpha=0.3, 139 | ) 140 | 141 | plt.title("Model Seasonality (%s) for period: %s days" % (sn, period)) 142 | plt.gca().xaxis.label.set_visible(False) 143 | return all_figures 144 | -------------------------------------------------------------------------------- /lib/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% if app_name %}{{ app_name }} - by {% endif %}Sherlock - The ASO Inspector 4 | 5 | 6 | 8 | 27 | 28 | 29 | 30 |

{% if app_name %}{{ app_name }} - by {% endif %}Sherlock - The ASO Inspector

31 |

An open source tool by Luca 32 | Giacomel. 33 | Based on pm-prophet, 34 | license is CC BY-SA.

35 | 36 |

Menu

37 |
38 | {% if textual_model %}Textual Updates{% endif %} 39 | {% if visual_model %}Visual Updates{% endif %} 40 | {% if textual_seasonality[7] %}Seasonality Analysis{% endif %} 41 | {% if extra_regressors_plots %}Uplift Analysis{% endif %} 42 | Summary 43 |
44 | 45 | {% if textual_model %} 46 |

Textual Updates

47 |

This model shows a fit of the downloads model, using as predictors the seasonality trends, paid user acquisition and 48 | the textual updates performed over time. Red lines represent textual updates dates.

49 |

The impact of updates is shown in the summary.

50 | textual_model 51 | {% endif %} 52 | 53 | {% if visual_model %} 54 |

Visual Updates

55 |

This model shows a fit of the conversion model, using as predictors the weekly seasonality trend and 56 | the visual updates performed over time. Red lines represent visual updates dates.

57 |

The impact of updates is shown in the summary.

58 |

IMPORTANT Notice: please note that it's very likely that conversion reported by ASc before 1st March 2019 is not reliable 59 | as the App Store was counting impressions for apps that were not being shown in the search results (impressions were over-reported). 60 | visual_model 61 | {% endif %} 62 | 63 | {% if textual_seasonality[7] %} 64 |

Seasonality Analysis

65 |

66 | Seasonality is a common phenomenon to observe in apps, some make more downloads during the weekdays while other 67 | more during the weekends. Depending on the length of the time-series provided, you will also be able to see 68 | monthly and yearly seasonalities. 69 |

70 |

71 | Note: if you have fitted a regressor that is also very seasonal (e.g. Apple Search Ads) the graph below 72 | will not be an accurate representation of your app seasonality 73 |

74 |

Weekly Seasonality

75 |
From Downloads
76 | t_seasonality_7_days 77 |
From Conversion
78 | c_seasonality_7_days 79 | {% if 30 in textual_seasonality %} 80 |

Monthly Seasonality

81 |
From Downloads
82 | t_seasonality_30days 83 |
From Conversion
84 | c_seasonality_30days 85 | {% endif %} 86 | {% if 365 in textual_seasonality %} 87 |

Yearly Seasonality

88 |
From Downloads
89 | t_seasonality_365_days 90 |
From Conversion
91 | c_seasonality_365_days 92 | {% endif %} 93 | {% endif %} 94 | 95 | {% if extra_regressors_plots %} 96 |

Uplift Analysis

97 |

98 | Uplift (or k-factor) is the extra effect on organic downloads that is given by certain sources of traffic.
99 | The impact of the variable is shown as a full posterior distribution over the spectrum of possible values. 100 | Given that central values (where bars are higher) are more likely, if the 0 is in the central part of the histogram 101 | the impact of this variable will not be statistically significant as we cannot exclude with confidence that it's nonzero. 102 |

103 |

104 | Note: please consider that uplift is expressed in percentage (e.g., how many extra downloads do I get by this source on average?) 105 | and that this metric will also includes mis-attributions. 106 |

107 | {% for extra_regressor in extra_regressors_plots %} 108 | 109 | {% endfor %} 110 | {% endif %} 111 | 112 |

Summary

113 | 114 |

115 | Below you will find a summary of all the variables and their percentage effects over downloads. 116 | Note that the error reported is half of the width of credible interval for the posterior distribution 117 | and it's "equivalent" to a 95% confidence interval. Note: Coefficients for visual updates 118 | are absolute percentages (a 0.5% increase from a 5% baseline means an increase from 5% to 5.5% on average), while 119 | coefficients for textual updates are relative percentages (a 20% increase on a 100 downloads baseline is an 120 | increase of ~20 downloads per day on average). 121 |

122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | {% for line in summary %} 134 | 135 | 136 | 137 | 138 | 139 | 140 | {% endfor %} 141 | 142 |
VariableSignificantMedian effect2σ Error Equivalent (+/-)
{{line['name']}}{% if line['error']|abs > line['median']|abs %} Not Significant {% else %} Significant {% endif %}{{line['median']}}%{{line['error']}}%
143 | 144 | 145 | 148 | 151 | 154 | 155 | -------------------------------------------------------------------------------- /sherlock.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | from typing import Dict, List, Any, Union, Tuple 3 | 4 | import jinja2 5 | import pandas as pd 6 | import sys 7 | import logging 8 | 9 | from matplotlib.pylab import plt 10 | 11 | from pandas.errors import ParserError 12 | from pmprophet import PMProphet, Sampler 13 | import numpy as np 14 | import pymc3 as pm 15 | 16 | from lib.helpers import figure_to_base64, safe_mean, ERRORS, REQUIRED_COLUMNS, WARNINGS, OPTIONAL_COLUMNS 17 | from lib.plotting import plot_nowcast, plot_seasonality 18 | 19 | 20 | def read_input_file(file_path: str) -> pd.DataFrame: 21 | try: 22 | df = pd.read_csv(file_path) 23 | except FileNotFoundError: 24 | logging.error(ERRORS['not_found']) 25 | sys.exit() 26 | except ParserError: 27 | logging.error(ERRORS['not_readable']) 28 | sys.exit() 29 | missing_columns = ", ".join(set(REQUIRED_COLUMNS) - set(df.columns)) 30 | if missing_columns: 31 | logging.error(ERRORS['missing_columns'].format(missing_columns)) 32 | sys.exit() 33 | try: 34 | df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y') 35 | except ValueError: 36 | logging.error(ERRORS['date_column_not_readable']) 37 | sys.exit() 38 | return df.sort_values('date') 39 | 40 | 41 | def handle_outliers(data: pd.DataFrame) -> pd.DataFrame: 42 | if options.sigma: 43 | y = [] 44 | raw_y = data['y'].values 45 | for idx, val in enumerate(raw_y): 46 | if val < 0 or not (raw_y.mean() - options.sigma * raw_y.std()) < val < ( 47 | raw_y.mean() + options.sigma * raw_y.std()): 48 | ts_slice = raw_y[idx - 20:idx + 20] 49 | y.append(np.median(ts_slice[ts_slice >= 0])) 50 | else: 51 | y.append(val) 52 | data['y'] = y 53 | else: 54 | if data['y'].min() < 0: 55 | raise Exception(ERRORS['conversion_less_than_0']) 56 | return data 57 | 58 | 59 | def 
fit_beta_regression(model: PMProphet, data: pd.DataFrame) -> PMProphet: 60 | model._prepare_fit() 61 | with model.model: 62 | mean = pm.Deterministic('y_%s' % model.name, model.y) # no scaling needed 63 | hp_alpha = pm.HalfCauchy('y_alpha_%s' % model.name, 2.5) 64 | hp_beta = pm.Deterministic('y_beta_%s' % model.name, hp_alpha * ((1 - mean) / mean)) 65 | pm.Beta("observed_%s" % model.name, hp_alpha, hp_beta, observed=data['y']) 66 | pm.Deterministic('y_hat_%s' % model.name, mean) 67 | model.fit(10000 if options.sampler == 'metropolis' else 2000, 68 | method=Sampler.METROPOLIS if options.sampler == 'metropolis' else Sampler.NUTS, 69 | finalize=False, 70 | step_kwargs={'compute_convergence_checks': False} if options.sampler == 'metropolis' else {}) 71 | return model 72 | 73 | 74 | def summary_from_model_regressors(model: PMProphet, regressors: Union[List, Tuple]) -> List[ 75 | Dict[str, Union[str, float]]]: 76 | alpha = options.alpha 77 | summary = [] 78 | for idx, regressor in enumerate(regressors): 79 | error = (pd.np.percentile( 80 | model.trace['regressors_{}'.format(model.name)][:, idx], 81 | 100 - (alpha * 100 / 2) 82 | ) - pd.np.percentile( 83 | model.trace['regressors_{}'.format(model.name)][:, idx], 84 | (alpha * 100 / 2) 85 | )) / 2 86 | summary.append({ 87 | 'name': regressor, 88 | 'median': pd.np.round(pd.np.median(model.trace['regressors_{}'.format(model.name)][:, idx] * 100), 2), 89 | 'error': pd.np.round(error * 100, 2), 90 | }) 91 | return summary 92 | 93 | 94 | def create_model(model_name: str, data: pd.DataFrame, seasonality_scale: float, growth: bool, 95 | regressors: Union[List, Tuple] = (), changepoints: Union[List, Tuple] = ()) -> PMProphet: 96 | model = PMProphet( 97 | data, 98 | growth=growth, 99 | seasonality_prior_scale=seasonality_scale, 100 | changepoints=[] if not changepoints else changepoints, 101 | name=model_name, 102 | ) 103 | 104 | for regressor in regressors: 105 | model.add_regressor(regressor) 106 | 107 | if not options.weekly: 
108 | model.add_seasonality(7, 3) 109 | 110 | if (data['ds'].max() - data['ds'].min()).days > 365: 111 | model.add_seasonality(365, 5) 112 | return model 113 | 114 | 115 | def visual_update_analysis(df: pd.DataFrame) -> Tuple[Dict[str, str], List[ 116 | Dict[str, Union[str, float]]]]: 117 | summary = [] 118 | template_vars = {} 119 | df = df.rename(columns={'date': 'ds'}) 120 | if 'asa_impressions' in df.columns: 121 | df['impressions'] = df['search_impressions'] - df['asa_impressions'] 122 | df['conversions'] = df['search_downloads'] - df['asa'] 123 | else: 124 | df['impressions'] = df['search_impressions'] 125 | df['conversions'] = df['search_downloads'] 126 | df['y'] = df['conversions'] / df['impressions'] 127 | df.index = df['ds'] 128 | 129 | if options.weekly: 130 | df = df.resample('W').apply(safe_mean) 131 | 132 | df = handle_outliers(df.copy()) 133 | 134 | time_regressors = [] 135 | for _, row in df.iterrows(): 136 | if row['update'] == 'visual' or row['update'] == 'all': 137 | additional_regressor = '{} (visual)'.format(str(row['ds']).split(" ")[0]) 138 | df[additional_regressor] = [1 if other_row['ds'] >= row['ds'] else 0 for 139 | _, other_row in df.iterrows()] 140 | time_regressors.append(additional_regressor) 141 | 142 | model = create_model('sherlock_visual', df, 1.0, False, time_regressors) 143 | conversion_model = fit_beta_regression(model, df) 144 | 145 | fig = plot_nowcast(conversion_model, 146 | [row['ds'] for _, row in df.iterrows() if row['update'] == 'visual' or row['update'] == 'all']) 147 | plt.title('Conversion & Visual Updates') 148 | template_vars['visual_model'] = figure_to_base64(fig) 149 | 150 | summary.extend(summary_from_model_regressors(conversion_model, time_regressors)) 151 | seasonality = {} 152 | for period, fig in plot_seasonality(conversion_model, alpha=options.alpha, plot_kwargs={}).items(): 153 | seasonality[int(period)] = figure_to_base64(fig) 154 | template_vars['conversion_seasonality'] = seasonality 155 | 156 | return 
template_vars, summary 157 | 158 | 159 | def textual_update_analysis(df: pd.DataFrame, extra_columns: List) -> Tuple[Dict[str, str], List[ 160 | Dict[str, Union[str, float]]]]: 161 | template_vars: Dict[str, Any] = {} 162 | summary = [] 163 | df = df.rename(columns={'date': 'ds', 'search_downloads': 'y'}) 164 | if 'asa' in df.columns: 165 | df['y'] = df['y'] - df['asa'] 166 | df.index = df['ds'] 167 | 168 | df = handle_outliers(df) 169 | 170 | if options.weekly: 171 | df = df.resample('W').apply(safe_mean) 172 | 173 | time_regressors = [] 174 | for _, row in df.iterrows(): 175 | if row['update'] == 'textual' or row['update'] == 'all': 176 | additional_regressor = '{} (text)'.format(str(row['ds']).split(" ")[0]) 177 | df[additional_regressor] = [other_row['y'] if other_row['ds'] >= row['ds'] else 0 for 178 | _, other_row in df.iterrows()] 179 | time_regressors.append(additional_regressor) 180 | 181 | seasonality_scale = df['y'].std() if options.seasonality_scale == 0 else options.seasonality_scale 182 | model = create_model('sherlock_textual', df, seasonality_scale, True, time_regressors + extra_columns) 183 | 184 | model.fit(10000 if options.sampler == 'metropolis' else 2000, 185 | method=Sampler.METROPOLIS if options.sampler == 'metropolis' else Sampler.NUTS, 186 | step_kwargs={'compute_convergence_checks': False} if options.sampler == 'metropolis' else {}) 187 | 188 | fig = plot_nowcast(model, 189 | [row['ds'] for _, row in df.iterrows() if row['update'] == 'textual' or row['update'] == 'all']) 190 | plt.title('Downloads & Textual Updates') 191 | template_vars['textual_model'] = figure_to_base64(fig) 192 | 193 | summary.extend(summary_from_model_regressors(model, time_regressors + extra_columns)) 194 | 195 | extra_regressors_plots: List[Dict[str, str]] = [] 196 | for i in range(len(time_regressors), len(time_regressors) + len(extra_columns)): 197 | fig = plt.figure() 198 | plt.grid() 199 | plt.hist(model.trace['regressors_{}'.format(model.name)][:, i] * 100, 
bins=30, alpha=0.8, histtype='stepfilled', 200 | density=True) 201 | plt.axvline(np.median(model.trace['regressors_{}'.format(model.name)][:, i]) * 100, color="C3", lw=1, 202 | ls="dotted") 203 | plt.title("{} (in %)".format(extra_columns[i - len(time_regressors)])) 204 | extra_regressors_plots.append({ 205 | 'name': extra_columns[i - len(time_regressors)], 206 | 'img_data': figure_to_base64(fig) 207 | }) 208 | 209 | template_vars['extra_regressors_plots'] = extra_regressors_plots 210 | 211 | seasonality = {} 212 | for period, fig in plot_seasonality(model, alpha=options.alpha, plot_kwargs={}).items(): 213 | seasonality[int(period)] = figure_to_base64(fig) 214 | template_vars['textual_seasonality'] = seasonality 215 | 216 | return template_vars, summary 217 | 218 | 219 | def run_sherlock() -> None: 220 | template_vars = {'textual_seasonality': {}, 'conversion_seasonality': {}} 221 | 222 | df = read_input_file(options.input_file) 223 | df['update'] = df['update'].str.lower() 224 | for unknown_update in (set(df['update'].unique()) - {'textual', 'visual', pd.np.nan, 'all'}): 225 | logging.warning(WARNINGS['update_not_understood'].format(unknown_update)) 226 | time_span = (df['date'].max() - df['date'].min()).days 227 | if time_span < 7: 228 | logging.error(ERRORS['timespan_too_short']) 229 | sys.exit() 230 | if time_span < 30: 231 | logging.warning(WARNINGS['timespan_too_short']) 232 | extra_columns = list(set(df.columns) - set(REQUIRED_COLUMNS + OPTIONAL_COLUMNS + ['date', 'search_downloads'])) 233 | summary = [] 234 | 235 | tv, s = visual_update_analysis(df.copy()) 236 | template_vars.update(tv) 237 | summary.extend(s) 238 | 239 | tv, s = textual_update_analysis(df.copy(), extra_columns) 240 | template_vars.update(tv) 241 | summary.extend(s) 242 | 243 | if options.app_name: 244 | template_vars['app_name'] = options.app_name 245 | template_vars['summary'] = summary 246 | 247 | template_env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath="./lib")) 248 | 
template = template_env.get_template("template.html") 249 | 250 | with open(options.output_file, 'w') as output_file: 251 | output_file.write(template.render(**template_vars)) 252 | 253 | 254 | if __name__ == '__main__': 255 | parser = OptionParser() 256 | parser.add_option("-a", "--app-name", dest='app_name', default=None, 257 | help="Specify the app name if you want a more personalized report") 258 | parser.add_option("-i", "--input-file", dest="input_file", 259 | help="Input CSV file", metavar="FILE") 260 | parser.add_option("-o", "--output-file", dest="output_file", 261 | help="Output report file (in html format)", metavar="FILE", default='report.html') 262 | parser.add_option("-s", "--sampler", dest='sampler', choices=['metropolis', 'nuts'], default='metropolis', 263 | help='Sampler to use ("nuts" is slower but more precise, default "metropolis")') 264 | parser.add_option("-n", "--no-asa", dest='no_asa', action="store_true", default=False, 265 | help="Do not use ASA as an additional regressor (better seasonality fits)") 266 | parser.add_option("-w", "--weekly", dest='weekly', action="store_true", default=False, 267 | help="Run the analysis on a weekly resampling") 268 | parser.add_option("-r", "--remove-outliers-sigma", dest='sigma', default=False, type='float', 269 | help='''Remove outliers at more than X sigma from the mean (suggested values range between 1.5-3.5). 270 | Default value is: 0 that means that Sherlock will not remove outliers''') 271 | parser.add_option("-l", "--significance-level", dest='alpha', default=0.05, type='float', 272 | help="The significance level for the analysis (default is 0.05)") 273 | parser.add_option("-k", "--seasonality-scale", dest='seasonality_scale', default=0, type='float', 274 | help="""The scale of the seasonality, if it fits poorly because you have 275 | great variance due to seasonality increase this. 
By default this is automatically computed""") 276 | 277 | (options, args) = parser.parse_args() 278 | 279 | if not options.input_file: 280 | logging.error(ERRORS['no_input']) 281 | sys.exit() 282 | if options.no_asa: 283 | OPTIONAL_COLUMNS.append('asa') 284 | 285 | try: 286 | run_sherlock() 287 | except pm.exceptions.SamplingError: 288 | print('NUTS cannot model the data, retrying with metropolis') 289 | options.sampler = 'metropolis' 290 | run_sherlock() 291 | --------------------------------------------------------------------------------