├── .gitignore ├── Pipfile ├── lib ├── helpers.py ├── plotting.py └── template.html ├── examples └── example.csv ├── readme.MD └── sherlock.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | out.html 4 | Pipfile.lock 5 | .DS_Store 6 | .idea 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | pmprophet = "==0.2.8" 10 | arviz = "*" 11 | pymc3 = "*" 12 | jinja2 = "*" 13 | matplotlib = "*" 14 | 15 | [requires] 16 | python_version = "3.7" 17 | -------------------------------------------------------------------------------- /lib/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import io 3 | import base64 4 | 5 | ERRORS = { 6 | "no_input": "No input data file was provided", 7 | "not_found": "I can't find the input file you provided me, try removing any space in it's name", 8 | "not_readable": "The input file is not readable (make sure it's a valid csv file)", 9 | "missing_columns": "The input data file is missing one or more mandatory columns: {}", 10 | "date_column_not_readable": "The date column is not readable, try using the format dd/mm/yyyy (e.g. 30/12/1990)", 11 | "timespan_too_short": "Provide at least 7 days of data", 12 | "dls_less_than_0": "Downloads are below zero for certain dates. Check your data or allow outlier removal with -r", 13 | "conversion_less_than_0": "The conversion is below zero for certain dates. 
Check your data or allow outlier removal with -r", 14 | } 15 | 16 | WARNINGS = { 17 | 'update_not_understood': 'An update was marked as "{}" and was ignored, valid updates are "textual" and "visual"', 18 | "timespan_too_short": 'For best results use at least one month of data', 19 | } 20 | 21 | INFO = { 22 | 'additional_regressor': 'Using column {} as an additional regressor' 23 | } 24 | 25 | REQUIRED_COLUMNS = [ 26 | 'date', 27 | 'update', 28 | 'search_downloads', 29 | 'search_impressions' 30 | ] 31 | 32 | OPTIONAL_COLUMNS = [ 33 | 'asa_impressions', 34 | ] 35 | 36 | 37 | def figure_to_base64(fig): 38 | io_stream = io.BytesIO() 39 | fig.savefig(io_stream, format='png') 40 | io_stream.seek(0) 41 | return (b'data:image/png;base64, ' + base64.b64encode(io_stream.read())).decode() 42 | 43 | 44 | def safe_mean(x): 45 | try: 46 | return np.mean(x) 47 | except TypeError: 48 | x = x.dropna() 49 | if x.empty: 50 | return None 51 | else: 52 | return x[0] 53 | -------------------------------------------------------------------------------- /examples/example.csv: -------------------------------------------------------------------------------- 1 | date,update,search_downloads,search_impressions,asa_impressions,asa,google 2 | 01/11/2018,,725.0,8786,3105,334,94 3 | 02/11/2018,,711.0,8374,3890,348,74 4 | 03/11/2018,,790.0,9515,3900,400,71 5 | 04/11/2018,,902.0,10371,4600,517,107 6 | 05/11/2018,,721.0,8681,3409,369,108 7 | 06/11/2018,,733.0,8243,3642,391,95 8 | 07/11/2018,,667.0,7793,2524,322,97 9 | 08/11/2018,,775.0,8892,3083,391,67 10 | 09/11/2018,,687.0,9052,3672,373,108 11 | 10/11/2018,,745.0,8692,3473,433,110 12 | 11/11/2018,,762.0,9351,4142,390,92 13 | 12/11/2018,,776.0,9371,3805,395,91 14 | 13/11/2018,,725.0,8364,3226,330,204 15 | 14/11/2018,,696.0,7730,2417,297,271 16 | 15/11/2018,,735.0,8900,3719,321,252 17 | 16/11/2018,,940.0,11172,4787,516,223 18 | 17/11/2018,,1067.0,14395,8754,686,188 19 | 18/11/2018,,1133.0,14722,9353,770,153 20 | 19/11/2018,,929.0,13941,7075,644,129 
21 | 20/11/2018,,855.0,12840,5821,519,130 22 | 21/11/2018,,916.0,12024,5344,490,157 23 | 22/11/2018,,876.0,11490,5656,461,159 24 | 23/11/2018,,970.0,11383,6281,528,123 25 | 24/11/2018,,912.0,14066,7242,561,113 26 | 25/11/2018,,818.0,11886,6055,524,118 27 | 26/11/2018,,823.0,10661,7744,630,98 28 | 27/11/2018,,670.0,9266,4024,323,133 29 | 28/11/2018,,854.0,11844,5596,470,102 30 | 29/11/2018,,874.0,11668,5727,523,118 31 | 30/11/2018,,730.0,12789,6472,381,112 32 | 01/12/2018,,1109.0,14686,8085,629,117 33 | 02/12/2018,,1088.0,12858,6831,622,169 34 | 03/12/2018,visual,926.0,11366,5811,569,115 35 | 04/12/2018,,825.0,9462,4689,489,105 36 | 05/12/2018,,919.0,10024,4804,522,102 37 | 06/12/2018,,905.0,8799,3773,347,110 38 | 07/12/2018,,746.0,8452,3723,309,134 39 | 08/12/2018,,871.0,10105,4313,484,154 40 | 09/12/2018,,1065.0,11098,6048,680,128 41 | 10/12/2018,,866.0,9660,4936,485,112 42 | 11/12/2018,,930.0,11273,4725,560,162 43 | 12/12/2018,,1097.0,11491,5563,797,190 44 | 13/12/2018,,1170.0,13394,7253,778,156 45 | 14/12/2018,,1364.0,14422,10244,954,145 46 | 15/12/2018,,1451.0,18068,12438,1087,179 47 | 16/12/2018,,1568.0,19479,14878,1151,201 48 | 17/12/2018,,1315.0,18427,12337,985,142 49 | 18/12/2018,,1412.0,13688,9544,1001,111 50 | 19/12/2018,,1371.0,17575,10999,1058,149 51 | 20/12/2018,,1506.0,15090,9029,1095,194 52 | 21/12/2018,,1297.0,13762,8338,937,171 53 | 22/12/2018,all,1111.0,12600,6691,734,158 54 | 23/12/2018,,829.0,10262,4932,472,148 55 | 24/12/2018,,898.0,11565,4308,469,115 56 | 25/12/2018,,738.0,10827,4558,354,89 57 | 26/12/2018,,949.0,11061,5279,453,136 58 | 27/12/2018,,868.0,11166,5101,568,122 59 | 28/12/2018,,958.0,11272,5134,545,128 60 | 29/12/2018,,934.0,11317,4647,460,145 61 | 30/12/2018,,1044.0,12539,5156,601,178 62 | 31/12/2018,,1366.0,15050,7941,873,190 -------------------------------------------------------------------------------- /readme.MD: -------------------------------------------------------------------------------- 1 | # Sherlock - The ASO 
inspector 2 | 3 | Sherlock is a program I wrote as a companion to my presentation at the [ASO Conference in NYC](https://asoconference.com/newyork/) in 2019. 4 | 5 | It's based on top of a library I wrote called [pm-prophet](https://github.com/luke14free/pm-prophet) 6 | 7 | It can produce curated reports about your app and its traffic, including: 8 | * Updates impact (both visual and textual updates) 9 | * Seasonality 10 | * Uplift 11 | 12 | **Sample output** is demonstrated in the file `examples/report.html` (which is a fit of sherlock on some fake data contained in `examples/example.csv`). 13 | 14 | ### Installation 15 | 16 | 1) [Install python 3](https://www.python.org/downloads/) 17 | 2) [Install pipenv](https://docs.pipenv.org/en/latest/install/) 18 | 3) Open your terminal and run `pipenv install` from the folder in which you have cloned this repository 19 | 20 | Et Voilà. You should be all set. 21 | 22 | ### Preparing your data 23 | 24 | Sherlock takes as input a CSV file of different time-series. The naming convention is strict and it's case-sensitive. 25 | 26 | **Required columns:** 27 | 28 | 1) `date` the date of the events (expressed in *dd/mm/yyyy* format, e.g. 30/12/2019). 29 | 2) `update` a column with value `textual`, `visual` or `all` (i.e. visual and textual) to tell sherlock what type of update was done on that particular date (in lowercase!). 30 | 3) `search_downloads` downloads data coming from iTunes connect filtered by App Store Search source 31 | 4) `search_impressions` impressions data coming from iTunes connect filtered by App Store Search source 32 | 33 | **Optional columns (if you do Apple Search Ads):** 34 | 35 | 1) `asa_impressions` impressions coming from Apple Search Ads 36 | 2) `asa` downloads coming from Apple Search Ads 37 | 38 | **Extra columns:** Any other time series you like (e.g. Facebook, Google, Snapchat..) for which to compute uplift. 
39 | 40 | ### Usage 41 | 42 | Usage is very simple, from the terminal run: 43 | 44 | `pipenv shell` 45 | 46 | to activate the virtual environment and: 47 | 48 | `python sherlock.py -i examples/example.csv` 49 | 50 | to run the script. There are some options available: 51 | 52 | ``` 53 | Options: 54 | -h, --help show this help message and exit 55 | -a APP_NAME, --app-name=APP_NAME 56 | Specify the app name if you want a more personalized 57 | report 58 | -i FILE, --input-file=FILE 59 | Input CSV file 60 | -o FILE, --output-file=FILE 61 | Output report file (in html format) 62 | -s SAMPLER, --sampler=SAMPLER 63 | Sampler to use ("nuts" is slower but more precise, 64 | default "metropolis") 65 | -n, --no-asa Do not use ASA as an additional regressor (better 66 | seasonality fits) 67 | -w, --weekly Run the analysis on a weekly resampling 68 | -r SIGMA, --remove-outliers-sigma=SIGMA 69 | Remove outliers at more than X sigma from the mean 70 | (suggested values range between 1.5-3.5). Default 71 | value is: 0 that means that Sherlock will not remove 72 | outliers 73 | -l ALPHA, --significance-level=ALPHA 74 | The significance level for the analysis (default is 75 | 0.05) 76 | -k SEASONALITY_SCALE, --seasonality-scale=SEASONALITY_SCALE 77 | The scale of the seasonality, if it fits poorly 78 | because you have great variance due to seasonality 79 | increase this. By default this is automatically 80 | computed 81 | ``` 82 | 83 | ### Output 84 | 85 | Open the `report.html` file generated by Sherlock. 
86 | 87 | ### License 88 | 89 | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode) 90 | -------------------------------------------------------------------------------- /lib/plotting.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pylab as plt 3 | import numpy as np 4 | 5 | 6 | def plot_nowcast(model, updates): 7 | fig = plt.figure(figsize=(20, 10)) 8 | y = model.trace['y_hat_%s' % model.name] 9 | ddf = pd.DataFrame( 10 | [ 11 | np.percentile(y, 50, axis=0), 12 | np.max(y, axis=0), 13 | np.min(y, axis=0), 14 | ] 15 | ).T 16 | ddf["ds"] = model.data["ds"] 17 | ddf.columns = ["y_hat", "y_low", "y_high", "ds"] 18 | ddf["orig_y"] = model.data["y"] 19 | ddf.plot("ds", "y_hat", ax=plt.gca()) 20 | plt.fill_between( 21 | ddf["ds"].values, 22 | ddf["y_low"].values.astype(float), 23 | ddf["y_high"].values.astype(float), 24 | alpha=0.3, 25 | ) 26 | ddf.plot("ds", "orig_y", style="k.", ax=plt.gca(), alpha=0.3) 27 | for update in updates: 28 | plt.axvline( 29 | update, color="C3", lw=1, ls="dotted" 30 | ) 31 | plt.grid(axis="y") 32 | return fig 33 | 34 | 35 | def plot_predict(prediction, original, updates): 36 | fig = plt.figure(figsize=(20, 10)) 37 | prediction.plot("ds", "y_hat", ax=plt.gca()) 38 | prediction["orig_y"] = original["y"] 39 | plt.fill_between( 40 | prediction["ds"].values, 41 | prediction["y_low"].values.astype(float), 42 | prediction["y_high"].values.astype(float), 43 | alpha=0.3, 44 | ) 45 | 46 | prediction.plot("ds", "orig_y", style="k.", ax=plt.gca(), alpha=0.2) 47 | for update in updates: 48 | plt.axvline( 49 | update, color="C3", lw=1, ls="dotted" 50 | ) 51 | plt.grid(axis="y") 52 | return fig 53 | 54 | 55 | def plot_seasonality(self, alpha: float, plot_kwargs: bool): 56 | periods = list(set([float(i.split("_")[1]) for i in self.seasonality])) 57 | 58 | additive_ts, multiplicative_ts = self._fit_seasonality() 59 | 60 | all_seasonalities = [("additive", 
additive_ts)] 61 | if len(self.multiplicative_data): 62 | all_seasonalities.append(("multiplicative", multiplicative_ts)) 63 | all_figures = {} 64 | 65 | for sn, ts in all_seasonalities: 66 | if (sn == "multiplicative" and np.sum(ts) == 1) or ( 67 | sn == "additive" and np.sum(ts) == 0 68 | ): 69 | continue 70 | ddf = pd.DataFrame( 71 | np.vstack( 72 | [ 73 | np.percentile(ts[:, :, self.skip_first:], 50, axis=-1), 74 | np.percentile( 75 | ts[:, :, self.skip_first:], alpha / 2 * 100, axis=-1 76 | ), 77 | np.percentile( 78 | ts[:, :, self.skip_first:], (1 - alpha / 2) * 100, axis=-1 79 | ), 80 | ] 81 | ).T, 82 | columns=[ 83 | "%s_%s" % (p, l) 84 | for l in ["mid", "low", "high"] 85 | for p in periods[::-1] 86 | ], 87 | ) 88 | ddf.loc[:, "ds"] = self.data["ds"] 89 | 90 | for period in periods: 91 | if int(period) == 0: 92 | step = int( 93 | self.data["ds"].diff().mean().total_seconds() // float(period) 94 | ) 95 | else: 96 | step = int(period) 97 | graph = ddf.head(step) 98 | if period == 7: 99 | ddf.loc[:, "dow"] = [i for i in ddf["ds"].dt.weekday] 100 | graph = ( 101 | ddf[ 102 | [ 103 | "dow", 104 | "%s_low" % period, 105 | "%s_mid" % period, 106 | "%s_high" % period, 107 | ] 108 | ] 109 | .groupby("dow") 110 | .mean() 111 | .sort_values("dow") 112 | ) 113 | graph.loc[:, "ds"] = [ 114 | ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][i] 115 | for i in graph.index 116 | ] 117 | graph = graph.sort_index() 118 | fig = plt.figure(**plot_kwargs) 119 | all_figures[period] = fig 120 | graph.plot( 121 | y="%s_mid" % period, x="ds", color="C0", legend=False, ax=plt.gca() 122 | ) 123 | plt.grid() 124 | 125 | if period == 7: 126 | plt.xticks(range(7), graph["ds"].values) 127 | plt.fill_between( 128 | np.arange(0, 7), 129 | graph["%s_low" % period].values.astype(float), 130 | graph["%s_high" % period].values.astype(float), 131 | alpha=0.3, 132 | ) 133 | else: 134 | plt.fill_between( 135 | graph["ds"].values, 136 | graph["%s_low" % period].values.astype(float), 137 | 
graph["%s_high" % period].values.astype(float), 138 | alpha=0.3, 139 | ) 140 | 141 | plt.title("Model Seasonality (%s) for period: %s days" % (sn, period)) 142 | plt.gca().xaxis.label.set_visible(False) 143 | return all_figures 144 | -------------------------------------------------------------------------------- /lib/template.html: -------------------------------------------------------------------------------- 1 | 2 |
3 |An open source tool by Luca 32 | Giacomel. 33 | Based on pm-prophet, 34 | license is CC BY-SA.
35 | 36 |This model shows a fit of the downloads model, using as predictors the seasonality trends, paid user acquisition and 48 | the textual updates performed over time. Red lines represent textual updates dates.
49 |The impact of updates is shown in the summary.
50 |This model shows a fit of the conversion model, using as predictors the weekly seasonality trend and 56 | the visual updates performed over time. Red lines represent visual updates dates.
57 |The impact of updates is shown in the summary.
58 |IMPORTANT Notice: please note that it's very likely that conversion reported by ASc before 1st March 2019 is not reliable
59 | as the App Store was counting impressions for apps that were not being shown in the search results (impressions were over-reported).
60 |
61 | {% endif %}
62 |
63 | {% if textual_seasonality[7] %}
64 |
66 | Seasonality is a common phenomenon to observe in apps, some make more downloads during the weekdays while other 67 | more during the weekends. Depending on the length of the time-series provided, you will also be able to see 68 | monthly and yearly seasonalities. 69 |
70 |71 | Note: if you have fitted a regressor that is also very seasonal (e.g. Apple Search Ads) the graph below 72 | will not be an accurate representation of your app seasonality 73 |
74 |
98 | Uplift (or k-factor) is the extra effect on organic downloads that is given by certain sources of traffic.
99 | The impact of the variable is shown as a full posterior distribution over the spectrum of possible values.
100 | Given that central values (where bars are higher) are more likely, if the 0 is in the central part of the histogram
101 | the impact of this variable will not be statistically significant as we cannot exclude with confidence that it's nonzero.
102 |
104 | Note: please consider that uplift is expressed in percentage (e.g., how many extra downloads do I get by this source on average?) 105 | and that this metric will also includes mis-attributions. 106 |
107 | {% for extra_regressor in extra_regressors_plots %} 108 |115 | Below you will find a summary of all the variables and their percentage effects over downloads. 116 | Note that the error reported is half of the width of credible interval for the posterior distribution 117 | and it's "equivalent" to a 95% confidence interval. Note: Coefficients for visual updates 118 | are absolute percentages (a 0.5% increase from a 5% baseline means an increase from 5% to 5.5% on average), while 119 | coefficients for textual updates are relative percentages (a 20% increase on a 100 downloads baseline is an 120 | increase of ~20 downloads per day on average). 121 |
122 | 123 || Variable | 127 |Significant | 128 |Median effect | 129 |2σ Error Equivalent (+/-) | 130 |
|---|---|---|---|
| {{line['name']}} | 136 |{% if line['error']|abs > line['median']|abs %} Not Significant {% else %} Significant {% endif %} | 137 |{{line['median']}}% | 138 |{{line['error']}}% | 139 |