├── .gitignore ├── Pipfile ├── lib ├── helpers.py ├── plotting.py └── template.html ├── examples └── example.csv ├── readme.MD └── sherlock.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | out.html 4 | Pipfile.lock 5 | .DS_Store 6 | .idea 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | pmprophet = "==0.2.8" 10 | arviz = "*" 11 | pymc3 = "*" 12 | jinja2 = "*" 13 | matplotlib = "*" 14 | 15 | [requires] 16 | python_version = "3.7" 17 | -------------------------------------------------------------------------------- /lib/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import io 3 | import base64 4 | 5 | ERRORS = { 6 | "no_input": "No input data file was provided", 7 | "not_found": "I can't find the input file you provided me, try removing any space in it's name", 8 | "not_readable": "The input file is not readable (make sure it's a valid csv file)", 9 | "missing_columns": "The input data file is missing one or more mandatory columns: {}", 10 | "date_column_not_readable": "The date column is not readable, try using the format dd/mm/yyyy (e.g. 30/12/1990)", 11 | "timespan_too_short": "Provide at least 7 days of data", 12 | "dls_less_than_0": "Downloads are below zero for certain dates. Check your data or allow outlier removal with -r", 13 | "conversion_less_than_0": "The conversion is below zero for certain dates. 
Check your data or allow outlier removal with -r", 14 | } 15 | 16 | WARNINGS = { 17 | 'update_not_understood': 'An update was marked as "{}" and was ignored, valid updates are "textual" and "visual"', 18 | "timespan_too_short": 'For best results use at least one month of data', 19 | } 20 | 21 | INFO = { 22 | 'additional_regressor': 'Using column {} as an additional regressor' 23 | } 24 | 25 | REQUIRED_COLUMNS = [ 26 | 'date', 27 | 'update', 28 | 'search_downloads', 29 | 'search_impressions' 30 | ] 31 | 32 | OPTIONAL_COLUMNS = [ 33 | 'asa_impressions', 34 | ] 35 | 36 | 37 | def figure_to_base64(fig): 38 | io_stream = io.BytesIO() 39 | fig.savefig(io_stream, format='png') 40 | io_stream.seek(0) 41 | return (b'data:image/png;base64, ' + base64.b64encode(io_stream.read())).decode() 42 | 43 | 44 | def safe_mean(x): 45 | try: 46 | return np.mean(x) 47 | except TypeError: 48 | x = x.dropna() 49 | if x.empty: 50 | return None 51 | else: 52 | return x[0] 53 | -------------------------------------------------------------------------------- /examples/example.csv: -------------------------------------------------------------------------------- 1 | date,update,search_downloads,search_impressions,asa_impressions,asa,google 2 | 01/11/2018,,725.0,8786,3105,334,94 3 | 02/11/2018,,711.0,8374,3890,348,74 4 | 03/11/2018,,790.0,9515,3900,400,71 5 | 04/11/2018,,902.0,10371,4600,517,107 6 | 05/11/2018,,721.0,8681,3409,369,108 7 | 06/11/2018,,733.0,8243,3642,391,95 8 | 07/11/2018,,667.0,7793,2524,322,97 9 | 08/11/2018,,775.0,8892,3083,391,67 10 | 09/11/2018,,687.0,9052,3672,373,108 11 | 10/11/2018,,745.0,8692,3473,433,110 12 | 11/11/2018,,762.0,9351,4142,390,92 13 | 12/11/2018,,776.0,9371,3805,395,91 14 | 13/11/2018,,725.0,8364,3226,330,204 15 | 14/11/2018,,696.0,7730,2417,297,271 16 | 15/11/2018,,735.0,8900,3719,321,252 17 | 16/11/2018,,940.0,11172,4787,516,223 18 | 17/11/2018,,1067.0,14395,8754,686,188 19 | 18/11/2018,,1133.0,14722,9353,770,153 20 | 19/11/2018,,929.0,13941,7075,644,129 
21 | 20/11/2018,,855.0,12840,5821,519,130 22 | 21/11/2018,,916.0,12024,5344,490,157 23 | 22/11/2018,,876.0,11490,5656,461,159 24 | 23/11/2018,,970.0,11383,6281,528,123 25 | 24/11/2018,,912.0,14066,7242,561,113 26 | 25/11/2018,,818.0,11886,6055,524,118 27 | 26/11/2018,,823.0,10661,7744,630,98 28 | 27/11/2018,,670.0,9266,4024,323,133 29 | 28/11/2018,,854.0,11844,5596,470,102 30 | 29/11/2018,,874.0,11668,5727,523,118 31 | 30/11/2018,,730.0,12789,6472,381,112 32 | 01/12/2018,,1109.0,14686,8085,629,117 33 | 02/12/2018,,1088.0,12858,6831,622,169 34 | 03/12/2018,visual,926.0,11366,5811,569,115 35 | 04/12/2018,,825.0,9462,4689,489,105 36 | 05/12/2018,,919.0,10024,4804,522,102 37 | 06/12/2018,,905.0,8799,3773,347,110 38 | 07/12/2018,,746.0,8452,3723,309,134 39 | 08/12/2018,,871.0,10105,4313,484,154 40 | 09/12/2018,,1065.0,11098,6048,680,128 41 | 10/12/2018,,866.0,9660,4936,485,112 42 | 11/12/2018,,930.0,11273,4725,560,162 43 | 12/12/2018,,1097.0,11491,5563,797,190 44 | 13/12/2018,,1170.0,13394,7253,778,156 45 | 14/12/2018,,1364.0,14422,10244,954,145 46 | 15/12/2018,,1451.0,18068,12438,1087,179 47 | 16/12/2018,,1568.0,19479,14878,1151,201 48 | 17/12/2018,,1315.0,18427,12337,985,142 49 | 18/12/2018,,1412.0,13688,9544,1001,111 50 | 19/12/2018,,1371.0,17575,10999,1058,149 51 | 20/12/2018,,1506.0,15090,9029,1095,194 52 | 21/12/2018,,1297.0,13762,8338,937,171 53 | 22/12/2018,all,1111.0,12600,6691,734,158 54 | 23/12/2018,,829.0,10262,4932,472,148 55 | 24/12/2018,,898.0,11565,4308,469,115 56 | 25/12/2018,,738.0,10827,4558,354,89 57 | 26/12/2018,,949.0,11061,5279,453,136 58 | 27/12/2018,,868.0,11166,5101,568,122 59 | 28/12/2018,,958.0,11272,5134,545,128 60 | 29/12/2018,,934.0,11317,4647,460,145 61 | 30/12/2018,,1044.0,12539,5156,601,178 62 | 31/12/2018,,1366.0,15050,7941,873,190 -------------------------------------------------------------------------------- /readme.MD: -------------------------------------------------------------------------------- 1 | # Sherlock - The ASO 
inspector 2 | 3 | Sherlock is a program I wrote as a companion to my presentation at the [ASO Conference in NYC](https://asoconference.com/newyork/) in 2019. 4 | 5 | It's based on top of a library I wrote called [pm-prophet](https://github.com/luke14free/pm-prophet) 6 | 7 | It can produce curated reports about your app and its traffic, including: 8 | * Updates impact (both visual and textual updates) 9 | * Seasonality 10 | * Uplift 11 | 12 | **Sample output**: is demonstrated in the file `examples/report.html` (which is a fit of sherlock on some fake data contained in `examples/example.csv`). 13 | 14 | ### Installation 15 | 16 | 1) [Install python 3](https://www.python.org/downloads/) 17 | 2) [Install pipenv](https://docs.pipenv.org/en/latest/install/) 18 | 3) Open your terminal and run `pipenv install` from the folder in which you have cloned this repository 19 | 20 | Et Voilà. You should be all set. 21 | 22 | ### Preparing your data 23 | 24 | Sherlock takes as input a csv file of different time-series. The naming convention is strict and it's case-sensitive. 25 | 26 | **Required columns:** 27 | 28 | 1) `date` the date of the events (expressed in *dd/mm/yyyy* format, e.g. 30/12/2019). 29 | 2) `update` a column with value `textual`, `visual` or `all` (i.e. visual and textual) to tell sherlock what type of update was done on that particular date (in lowercase!). 30 | 3) `search_downloads` downloads data coming from iTunes connect filtered by App Store Search source 31 | 4) `search_impressions` impressions data coming from iTunes connect filtered by App Store Search source 32 | 33 | **Optional columns (if you do Apple Search Ads):** 34 | 35 | 1) `asa_impressions` impressions coming from Apple Search Ads 36 | 2) `asa` downloads coming from Apple Search Ads 37 | 38 | **Extra columns:** Any other time series you like (e.g. Facebook, Google, Snapchat..) for which to compute uplift. 
39 | 40 | ### Usage 41 | 42 | Usage is very simple, from the terminal run: 43 | 44 | `pipenv shell` 45 | 46 | to activate the virtual environment and: 47 | 48 | `python sherlock.py -i example.csv` 49 | 50 | To run the script. There are some options available: 51 | 52 | ``` 53 | Options: 54 | -h, --help show this help message and exit 55 | -a APP_NAME, --app-name=APP_NAME 56 | Specify the app name if you want a more personalized 57 | report 58 | -i FILE, --input-file=FILE 59 | Input CSV file 60 | -o FILE, --output-file=FILE 61 | Output report file (in html format) 62 | -s SAMPLER, --sampler=SAMPLER 63 | Sampler to use ("nuts" is slower but more precise, 64 | default "metropolis") 65 | -n, --no-asa Do not use ASA as an additional regressor (better 66 | seasonality fits) 67 | -w, --weekly Run the analysis on a weekly resampling 68 | -r SIGMA, --remove-outliers-sigma=SIGMA 69 | Remove outliers at more than X sigma from the mean 70 | (suggested values range between 1.5-3.5). Default 71 | value is: 0 that means that Sherlock will not remove 72 | outliers 73 | -l ALPHA, --significance-level=ALPHA 74 | The significance level for the analysis (default is 75 | 0.05) 76 | -k SEASONALITY_SCALE, --seasonality-scale=SEASONALITY_SCALE 77 | The scale of the seasonality, if it fits poorly 78 | because you have great variance due to seasonality 79 | increase this. By default this is automatically 80 | computed 81 | ``` 82 | 83 | ### Output 84 | 85 | Open the `report.html` file generated by Sherlock. 
86 | 87 | ### License 88 | 89 | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode) 90 | -------------------------------------------------------------------------------- /lib/plotting.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pylab as plt 3 | import numpy as np 4 | 5 | 6 | def plot_nowcast(model, updates): 7 | fig = plt.figure(figsize=(20, 10)) 8 | y = model.trace['y_hat_%s' % model.name] 9 | ddf = pd.DataFrame( 10 | [ 11 | np.percentile(y, 50, axis=0), 12 | np.max(y, axis=0), 13 | np.min(y, axis=0), 14 | ] 15 | ).T 16 | ddf["ds"] = model.data["ds"] 17 | ddf.columns = ["y_hat", "y_low", "y_high", "ds"] 18 | ddf["orig_y"] = model.data["y"] 19 | ddf.plot("ds", "y_hat", ax=plt.gca()) 20 | plt.fill_between( 21 | ddf["ds"].values, 22 | ddf["y_low"].values.astype(float), 23 | ddf["y_high"].values.astype(float), 24 | alpha=0.3, 25 | ) 26 | ddf.plot("ds", "orig_y", style="k.", ax=plt.gca(), alpha=0.3) 27 | for update in updates: 28 | plt.axvline( 29 | update, color="C3", lw=1, ls="dotted" 30 | ) 31 | plt.grid(axis="y") 32 | return fig 33 | 34 | 35 | def plot_predict(prediction, original, updates): 36 | fig = plt.figure(figsize=(20, 10)) 37 | prediction.plot("ds", "y_hat", ax=plt.gca()) 38 | prediction["orig_y"] = original["y"] 39 | plt.fill_between( 40 | prediction["ds"].values, 41 | prediction["y_low"].values.astype(float), 42 | prediction["y_high"].values.astype(float), 43 | alpha=0.3, 44 | ) 45 | 46 | prediction.plot("ds", "orig_y", style="k.", ax=plt.gca(), alpha=0.2) 47 | for update in updates: 48 | plt.axvline( 49 | update, color="C3", lw=1, ls="dotted" 50 | ) 51 | plt.grid(axis="y") 52 | return fig 53 | 54 | 55 | def plot_seasonality(self, alpha: float, plot_kwargs: bool): 56 | periods = list(set([float(i.split("_")[1]) for i in self.seasonality])) 57 | 58 | additive_ts, multiplicative_ts = self._fit_seasonality() 59 | 60 | all_seasonalities = [("additive", 
additive_ts)] 61 | if len(self.multiplicative_data): 62 | all_seasonalities.append(("multiplicative", multiplicative_ts)) 63 | all_figures = {} 64 | 65 | for sn, ts in all_seasonalities: 66 | if (sn == "multiplicative" and np.sum(ts) == 1) or ( 67 | sn == "additive" and np.sum(ts) == 0 68 | ): 69 | continue 70 | ddf = pd.DataFrame( 71 | np.vstack( 72 | [ 73 | np.percentile(ts[:, :, self.skip_first:], 50, axis=-1), 74 | np.percentile( 75 | ts[:, :, self.skip_first:], alpha / 2 * 100, axis=-1 76 | ), 77 | np.percentile( 78 | ts[:, :, self.skip_first:], (1 - alpha / 2) * 100, axis=-1 79 | ), 80 | ] 81 | ).T, 82 | columns=[ 83 | "%s_%s" % (p, l) 84 | for l in ["mid", "low", "high"] 85 | for p in periods[::-1] 86 | ], 87 | ) 88 | ddf.loc[:, "ds"] = self.data["ds"] 89 | 90 | for period in periods: 91 | if int(period) == 0: 92 | step = int( 93 | self.data["ds"].diff().mean().total_seconds() // float(period) 94 | ) 95 | else: 96 | step = int(period) 97 | graph = ddf.head(step) 98 | if period == 7: 99 | ddf.loc[:, "dow"] = [i for i in ddf["ds"].dt.weekday] 100 | graph = ( 101 | ddf[ 102 | [ 103 | "dow", 104 | "%s_low" % period, 105 | "%s_mid" % period, 106 | "%s_high" % period, 107 | ] 108 | ] 109 | .groupby("dow") 110 | .mean() 111 | .sort_values("dow") 112 | ) 113 | graph.loc[:, "ds"] = [ 114 | ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][i] 115 | for i in graph.index 116 | ] 117 | graph = graph.sort_index() 118 | fig = plt.figure(**plot_kwargs) 119 | all_figures[period] = fig 120 | graph.plot( 121 | y="%s_mid" % period, x="ds", color="C0", legend=False, ax=plt.gca() 122 | ) 123 | plt.grid() 124 | 125 | if period == 7: 126 | plt.xticks(range(7), graph["ds"].values) 127 | plt.fill_between( 128 | np.arange(0, 7), 129 | graph["%s_low" % period].values.astype(float), 130 | graph["%s_high" % period].values.astype(float), 131 | alpha=0.3, 132 | ) 133 | else: 134 | plt.fill_between( 135 | graph["ds"].values, 136 | graph["%s_low" % period].values.astype(float), 137 | 
graph["%s_high" % period].values.astype(float), 138 | alpha=0.3, 139 | ) 140 | 141 | plt.title("Model Seasonality (%s) for period: %s days" % (sn, period)) 142 | plt.gca().xaxis.label.set_visible(False) 143 | return all_figures 144 | -------------------------------------------------------------------------------- /lib/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% if app_name %}{{ app_name }} - by {% endif %}Sherlock - The ASO Inspector 4 | 5 | 6 | 8 | 27 | 28 | 29 | 30 |

{% if app_name %}{{ app_name }} - by {% endif %}Sherlock - The ASO Inspector

31 |

An open source tool by Luca 32 | Giacomel. 33 | Based on pm-prophet, 34 | license is CC BY-SA.

35 | 36 |

Menu

37 |
38 | {% if textual_model %}Textual Updates{% endif %} 39 | {% if visual_model %}Visual Updates{% endif %} 40 | {% if textual_seasonality[7] %}Seasonality Analysis{% endif %} 41 | {% if extra_regressors_plots %}Uplift Analysis{% endif %} 42 | Summary 43 |
44 | 45 | {% if textual_model %} 46 |

Textual Updates

47 |

This model shows a fit of the downloads model, using as predictors the seasonality trends, paid user acquisition and 48 | the textual updates performed over time. Red lines represent textual updates dates.

49 |

The impact of updates is shown in the summary.

50 | textual_model 51 | {% endif %} 52 | 53 | {% if visual_model %} 54 |

Visual Updates

55 |

This model shows a fit of the conversion model, using as predictors the weekly seasonality trend and 56 | the visual updates performed over time. Red lines represent visual updates dates.

57 |

The impact of updates is shown in the summary.

58 |

IMPORTANT Notice: please note that it's very likely that conversion reported by ASc before 1st March 2019 is not reliable 59 | as the App Store was counting impressions for apps that were not being shown in the search results (impressions were over-reported). 60 | visual_model 61 | {% endif %} 62 | 63 | {% if textual_seasonality[7] %} 64 |

Seasonality Analysis

65 |

66 | Seasonality is a common phenomenon to observe in apps, some make more downloads during the weekdays while other 67 | more during the weekends. Depending on the length of the time-series provided, you will also be able to see 68 | monthly and yearly seasonalities. 69 |

70 |

71 | Note: if you have fitted a regressor that is also very seasonal (e.g. Apple Search Ads) the graph below 72 | will not be an accurate representation of your app seasonality 73 |

74 |

Weekly Seasonality

75 |
From Downloads
76 | t_seasonality_7_days 77 |
From Conversion
78 | c_seasonality_7_days 79 | {% if 30 in textual_seasonality %} 80 |

Monthly Seasonality

81 |
From Downloads
82 | t_seasonality_30days 83 |
From Conversion
84 | c_seasonality_30days 85 | {% endif %} 86 | {% if 365 in textual_seasonality %} 87 |

Yearly Seasonality

88 |
From Downloads
89 | t_seasonality_365_days 90 |
From Conversion
91 | c_seasonality_365_days 92 | {% endif %} 93 | {% endif %} 94 | 95 | {% if extra_regressors_plots %} 96 |

Uplift Analysis

97 |

98 | Uplift (or k-factor) is the extra effect on organic downloads that is given by certain sources of traffic.
99 | The impact of the variable is shown as a full posterior distribution over the spectrum of possible values. 100 | Given that central values (where bars are higher) are more likely, if the 0 is in the central part of the histogram 101 | the impact of this variable will not be statistically significant as we cannot exclude with confidence that it's nonzero. 102 |

103 |

104 | Note: please consider that uplift is expressed in percentage (e.g., how many extra downloads do I get by this source on average?) 105 | and that this metric will also includes mis-attributions. 106 |

107 | {% for extra_regressor in extra_regressors_plots %} 108 | 109 | {% endfor %} 110 | {% endif %} 111 | 112 |

Summary

113 | 114 |

115 | Below you will find a summary of all the variables and their percentage effects over downloads. 116 | Note that the error reported is half of the width of credible interval for the posterior distribution 117 | and it's "equivalent" to a 95% confidence interval. Note: Coefficients for visual updates 118 | are absolute percentages (a 0.5% increase from a 5% baseline means an increase from 5% to 5.5% on average), while 119 | coefficients for textual updates are relative percentages (a 20% increase on a 100 downloads baseline is an 120 | increase of ~20 downloads per day on average). 121 |

122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | {% for line in summary %} 134 | 135 | 136 | 137 | 138 | 139 | 140 | {% endfor %} 141 | 142 |
VariableSignificantMedian effect2σ Error Equivalent (+/-)
{{line['name']}}{% if line['error']|abs > line['median']|abs %} Not Significant {% else %} Significant {% endif %}{{line['median']}}%{{line['error']}}%
143 | 144 | 145 | 148 | 151 | 154 | 155 | -------------------------------------------------------------------------------- /sherlock.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | from typing import Dict, List, Any, Union, Tuple 3 | 4 | import jinja2 5 | import pandas as pd 6 | import sys 7 | import logging 8 | 9 | from matplotlib.pylab import plt 10 | 11 | from pandas.errors import ParserError 12 | from pmprophet import PMProphet, Sampler 13 | import numpy as np 14 | import pymc3 as pm 15 | 16 | from lib.helpers import figure_to_base64, safe_mean, ERRORS, REQUIRED_COLUMNS, WARNINGS, OPTIONAL_COLUMNS 17 | from lib.plotting import plot_nowcast, plot_seasonality 18 | 19 | 20 | def read_input_file(file_path: str) -> pd.DataFrame: 21 | try: 22 | df = pd.read_csv(file_path) 23 | except FileNotFoundError: 24 | logging.error(ERRORS['not_found']) 25 | sys.exit() 26 | except ParserError: 27 | logging.error(ERRORS['not_readable']) 28 | sys.exit() 29 | missing_columns = ", ".join(set(REQUIRED_COLUMNS) - set(df.columns)) 30 | if missing_columns: 31 | logging.error(ERRORS['missing_columns'].format(missing_columns)) 32 | sys.exit() 33 | try: 34 | df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y') 35 | except ValueError: 36 | logging.error(ERRORS['date_column_not_readable']) 37 | sys.exit() 38 | return df.sort_values('date') 39 | 40 | 41 | def handle_outliers(data: pd.DataFrame) -> pd.DataFrame: 42 | if options.sigma: 43 | y = [] 44 | raw_y = data['y'].values 45 | for idx, val in enumerate(raw_y): 46 | if val < 0 or not (raw_y.mean() - options.sigma * raw_y.std()) < val < ( 47 | raw_y.mean() + options.sigma * raw_y.std()): 48 | ts_slice = raw_y[idx - 20:idx + 20] 49 | y.append(np.median(ts_slice[ts_slice >= 0])) 50 | else: 51 | y.append(val) 52 | data['y'] = y 53 | else: 54 | if data['y'].min() < 0: 55 | raise Exception(ERRORS['conversion_less_than_0']) 56 | return data 57 | 58 | 59 | def 
fit_beta_regression(model: PMProphet, data: pd.DataFrame) -> PMProphet: 60 | model._prepare_fit() 61 | with model.model: 62 | mean = pm.Deterministic('y_%s' % model.name, model.y) # no scaling needed 63 | hp_alpha = pm.HalfCauchy('y_alpha_%s' % model.name, 2.5) 64 | hp_beta = pm.Deterministic('y_beta_%s' % model.name, hp_alpha * ((1 - mean) / mean)) 65 | pm.Beta("observed_%s" % model.name, hp_alpha, hp_beta, observed=data['y']) 66 | pm.Deterministic('y_hat_%s' % model.name, mean) 67 | model.fit(10000 if options.sampler == 'metropolis' else 2000, 68 | method=Sampler.METROPOLIS if options.sampler == 'metropolis' else Sampler.NUTS, 69 | finalize=False, 70 | step_kwargs={'compute_convergence_checks': False} if options.sampler == 'metropolis' else {}) 71 | return model 72 | 73 | 74 | def summary_from_model_regressors(model: PMProphet, regressors: Union[List, Tuple]) -> List[ 75 | Dict[str, Union[str, float]]]: 76 | alpha = options.alpha 77 | summary = [] 78 | for idx, regressor in enumerate(regressors): 79 | error = (pd.np.percentile( 80 | model.trace['regressors_{}'.format(model.name)][:, idx], 81 | 100 - (alpha * 100 / 2) 82 | ) - pd.np.percentile( 83 | model.trace['regressors_{}'.format(model.name)][:, idx], 84 | (alpha * 100 / 2) 85 | )) / 2 86 | summary.append({ 87 | 'name': regressor, 88 | 'median': pd.np.round(pd.np.median(model.trace['regressors_{}'.format(model.name)][:, idx] * 100), 2), 89 | 'error': pd.np.round(error * 100, 2), 90 | }) 91 | return summary 92 | 93 | 94 | def create_model(model_name: str, data: pd.DataFrame, seasonality_scale: float, growth: bool, 95 | regressors: Union[List, Tuple] = (), changepoints: Union[List, Tuple] = ()) -> PMProphet: 96 | model = PMProphet( 97 | data, 98 | growth=growth, 99 | seasonality_prior_scale=seasonality_scale, 100 | changepoints=[] if not changepoints else changepoints, 101 | name=model_name, 102 | ) 103 | 104 | for regressor in regressors: 105 | model.add_regressor(regressor) 106 | 107 | if not options.weekly: 
108 | model.add_seasonality(7, 3) 109 | 110 | if (data['ds'].max() - data['ds'].min()).days > 365: 111 | model.add_seasonality(365, 5) 112 | return model 113 | 114 | 115 | def visual_update_analysis(df: pd.DataFrame) -> Tuple[Dict[str, str], List[ 116 | Dict[str, Union[str, float]]]]: 117 | summary = [] 118 | template_vars = {} 119 | df = df.rename(columns={'date': 'ds'}) 120 | if 'asa_impressions' in df.columns: 121 | df['impressions'] = df['search_impressions'] - df['asa_impressions'] 122 | df['conversions'] = df['search_downloads'] - df['asa'] 123 | else: 124 | df['impressions'] = df['search_impressions'] 125 | df['conversions'] = df['search_downloads'] 126 | df['y'] = df['conversions'] / df['impressions'] 127 | df.index = df['ds'] 128 | 129 | if options.weekly: 130 | df = df.resample('W').apply(safe_mean) 131 | 132 | df = handle_outliers(df.copy()) 133 | 134 | time_regressors = [] 135 | for _, row in df.iterrows(): 136 | if row['update'] == 'visual' or row['update'] == 'all': 137 | additional_regressor = '{} (visual)'.format(str(row['ds']).split(" ")[0]) 138 | df[additional_regressor] = [1 if other_row['ds'] >= row['ds'] else 0 for 139 | _, other_row in df.iterrows()] 140 | time_regressors.append(additional_regressor) 141 | 142 | model = create_model('sherlock_visual', df, 1.0, False, time_regressors) 143 | conversion_model = fit_beta_regression(model, df) 144 | 145 | fig = plot_nowcast(conversion_model, 146 | [row['ds'] for _, row in df.iterrows() if row['update'] == 'visual' or row['update'] == 'all']) 147 | plt.title('Conversion & Visual Updates') 148 | template_vars['visual_model'] = figure_to_base64(fig) 149 | 150 | summary.extend(summary_from_model_regressors(conversion_model, time_regressors)) 151 | seasonality = {} 152 | for period, fig in plot_seasonality(conversion_model, alpha=options.alpha, plot_kwargs={}).items(): 153 | seasonality[int(period)] = figure_to_base64(fig) 154 | template_vars['conversion_seasonality'] = seasonality 155 | 156 | return 
template_vars, summary 157 | 158 | 159 | def textual_update_analysis(df: pd.DataFrame, extra_columns: List) -> Tuple[Dict[str, str], List[ 160 | Dict[str, Union[str, float]]]]: 161 | template_vars: Dict[str, Any] = {} 162 | summary = [] 163 | df = df.rename(columns={'date': 'ds', 'search_downloads': 'y'}) 164 | if 'asa' in df.columns: 165 | df['y'] = df['y'] - df['asa'] 166 | df.index = df['ds'] 167 | 168 | df = handle_outliers(df) 169 | 170 | if options.weekly: 171 | df = df.resample('W').apply(safe_mean) 172 | 173 | time_regressors = [] 174 | for _, row in df.iterrows(): 175 | if row['update'] == 'textual' or row['update'] == 'all': 176 | additional_regressor = '{} (text)'.format(str(row['ds']).split(" ")[0]) 177 | df[additional_regressor] = [other_row['y'] if other_row['ds'] >= row['ds'] else 0 for 178 | _, other_row in df.iterrows()] 179 | time_regressors.append(additional_regressor) 180 | 181 | seasonality_scale = df['y'].std() if options.seasonality_scale == 0 else options.seasonality_scale 182 | model = create_model('sherlock_textual', df, seasonality_scale, True, time_regressors + extra_columns) 183 | 184 | model.fit(10000 if options.sampler == 'metropolis' else 2000, 185 | method=Sampler.METROPOLIS if options.sampler == 'metropolis' else Sampler.NUTS, 186 | step_kwargs={'compute_convergence_checks': False} if options.sampler == 'metropolis' else {}) 187 | 188 | fig = plot_nowcast(model, 189 | [row['ds'] for _, row in df.iterrows() if row['update'] == 'textual' or row['update'] == 'all']) 190 | plt.title('Downloads & Textual Updates') 191 | template_vars['textual_model'] = figure_to_base64(fig) 192 | 193 | summary.extend(summary_from_model_regressors(model, time_regressors + extra_columns)) 194 | 195 | extra_regressors_plots: List[Dict[str, str]] = [] 196 | for i in range(len(time_regressors), len(time_regressors) + len(extra_columns)): 197 | fig = plt.figure() 198 | plt.grid() 199 | plt.hist(model.trace['regressors_{}'.format(model.name)][:, i] * 100, 
bins=30, alpha=0.8, histtype='stepfilled', 200 | density=True) 201 | plt.axvline(np.median(model.trace['regressors_{}'.format(model.name)][:, i]) * 100, color="C3", lw=1, 202 | ls="dotted") 203 | plt.title("{} (in %)".format(extra_columns[i - len(time_regressors)])) 204 | extra_regressors_plots.append({ 205 | 'name': extra_columns[i - len(time_regressors)], 206 | 'img_data': figure_to_base64(fig) 207 | }) 208 | 209 | template_vars['extra_regressors_plots'] = extra_regressors_plots 210 | 211 | seasonality = {} 212 | for period, fig in plot_seasonality(model, alpha=options.alpha, plot_kwargs={}).items(): 213 | seasonality[int(period)] = figure_to_base64(fig) 214 | template_vars['textual_seasonality'] = seasonality 215 | 216 | return template_vars, summary 217 | 218 | 219 | def run_sherlock() -> None: 220 | template_vars = {'textual_seasonality': {}, 'conversion_seasonality': {}} 221 | 222 | df = read_input_file(options.input_file) 223 | df['update'] = df['update'].str.lower() 224 | for unknown_update in (set(df['update'].unique()) - {'textual', 'visual', pd.np.nan, 'all'}): 225 | logging.warning(WARNINGS['update_not_understood'].format(unknown_update)) 226 | time_span = (df['date'].max() - df['date'].min()).days 227 | if time_span < 7: 228 | logging.error(ERRORS['timespan_too_short']) 229 | sys.exit() 230 | if time_span < 30: 231 | logging.warning(WARNINGS['timespan_too_short']) 232 | extra_columns = list(set(df.columns) - set(REQUIRED_COLUMNS + OPTIONAL_COLUMNS + ['date', 'search_downloads'])) 233 | summary = [] 234 | 235 | tv, s = visual_update_analysis(df.copy()) 236 | template_vars.update(tv) 237 | summary.extend(s) 238 | 239 | tv, s = textual_update_analysis(df.copy(), extra_columns) 240 | template_vars.update(tv) 241 | summary.extend(s) 242 | 243 | if options.app_name: 244 | template_vars['app_name'] = options.app_name 245 | template_vars['summary'] = summary 246 | 247 | template_env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath="./lib")) 248 | 
template = template_env.get_template("template.html") 249 | 250 | with open(options.output_file, 'w') as output_file: 251 | output_file.write(template.render(**template_vars)) 252 | 253 | 254 | if __name__ == '__main__': 255 | parser = OptionParser() 256 | parser.add_option("-a", "--app-name", dest='app_name', default=None, 257 | help="Specify the app name if you want a more personalized report") 258 | parser.add_option("-i", "--input-file", dest="input_file", 259 | help="Input CSV file", metavar="FILE") 260 | parser.add_option("-o", "--output-file", dest="output_file", 261 | help="Output report file (in html format)", metavar="FILE", default='report.html') 262 | parser.add_option("-s", "--sampler", dest='sampler', choices=['metropolis', 'nuts'], default='metropolis', 263 | help='Sampler to use ("nuts" is slower but more precise, default "metropolis")') 264 | parser.add_option("-n", "--no-asa", dest='no_asa', action="store_true", default=False, 265 | help="Do not use ASA as an additional regressor (better seasonality fits)") 266 | parser.add_option("-w", "--weekly", dest='weekly', action="store_true", default=False, 267 | help="Run the analysis on a weekly resampling") 268 | parser.add_option("-r", "--remove-outliers-sigma", dest='sigma', default=False, type='float', 269 | help='''Remove outliers at more than X sigma from the mean (suggested values range between 1.5-3.5). 270 | Default value is: 0 that means that Sherlock will not remove outliers''') 271 | parser.add_option("-l", "--significance-level", dest='alpha', default=0.05, type='float', 272 | help="The significance level for the analysis (default is 0.05)") 273 | parser.add_option("-k", "--seasonality-scale", dest='seasonality_scale', default=0, type='float', 274 | help="""The scale of the seasonality, if it fits poorly because you have 275 | great variance due to seasonality increase this. 
By default this is automatically computed""") 276 | 277 | (options, args) = parser.parse_args() 278 | 279 | if not options.input_file: 280 | logging.error(ERRORS['no_input']) 281 | sys.exit() 282 | if options.no_asa: 283 | OPTIONAL_COLUMNS.append('asa') 284 | 285 | try: 286 | run_sherlock() 287 | except pm.exceptions.SamplingError: 288 | print('NUTS cannot model the data, retrying with metropolis') 289 | options.sampler = 'metropolis' 290 | run_sherlock() 291 | --------------------------------------------------------------------------------