├── CONTRIBUTING.md
├── Fractribution_Slides.pdf
├── LICENSE
├── README.md
└── py
├── Dockerfile
├── README.md
├── fractribution.py
├── main.py
├── requirements.txt
├── start.py
└── templates
├── channel_counts.sql
├── channel_definitions.sql
├── conversion_definition.sql
├── create_path_summary_results_table.sql
├── extract_channel_spend_data.sql
├── extract_channels.sql
├── extract_conversions.sql
├── extract_data.sql
├── extract_fullvisitorid_userid_map.sql
├── extract_ga_sessions.sql
├── generate_report.sql
├── path_summary.sql
├── path_transforms.sql
├── paths_to_conversion.sql
├── paths_to_non_conversion.sql
└── select_path_summary_query.sql
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows
28 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
29 |
--------------------------------------------------------------------------------
/Fractribution_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/fractribution/72ad8a4a851c75152c700283fbc4f58450b083af/Fractribution_Slides.pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | https://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | https://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fractribution code base
2 |
3 | ## Attribution Overview
4 |
5 | In a marketing context, Attribution involves identifying the set of user actions
6 | ("events" or "touchpoints") that contribute in some manner to a desired outcome,
7 | and then assigning a value to each of these events. Users might click on
8 | multiple ads before converting. This can make it challenging to assign proper
9 | credit to the different marketing channels. For example, should all the credit
10 | go to the last ad the user saw, or the first ad? Should all the ads share in the
11 | credit equally? Or should some other rule be used to determine how to distribute
12 | credit?
13 |
14 | ## Data-driven attribution (DDA)
15 |
16 | DDA attempts to algorithmically work out a fair weighting of credit among
17 | marketing channels. For example, a particular display ad might not convert
18 | immediately, but users who click the display ad might be much more likely to
19 | convert later on. In this case, the display ad should get credit, even though
20 | it may not be the first or last ad on a user's path to conversion.
21 |
22 | ## Fractribution Package
23 |
24 | Google Marketing Platform products already support DDA. This Fractribution
25 | package is a DDA algorithm that generates **user-level fractional attribution
26 | values** for each conversion. The advantage of user-level attribution is that
27 | the attribution values can later be joined with custom user-level data (e.g.
28 | transaction value, lifetime value etc). This can be useful when regulation or
29 | data policy prevents ecommerce/revenue events from being shared with the Google
30 | Marketing Platform.
31 |
32 | Please see Fractribution_Slides.pdf file in this directory for more background
33 | on use cases and details on the DDA algorithm.
34 |
35 | ## Using Fractribution
36 |
37 | For more instructions, including a tutorial for running Fractribution over
sample GA360 data from the Google Merchandise Store, see py/README.md.
39 |
40 | ## Directory structure
41 |
42 | ```bash
43 | fractribution
44 | ├── README.md
45 | ├── py
46 | ├──── README.md
47 | ├──── main.py
48 | ├──── fractribution.py
49 | ├──── templates/
50 | ├──── Dockerfile
51 | └──── start.py
52 | ```
53 |
54 | Disclaimer: This is not an officially supported Google product.
55 |
--------------------------------------------------------------------------------
/py/Dockerfile:
--------------------------------------------------------------------------------
# Container image for running Fractribution (entry point: start.py).
FROM python:3.7-slim

# Copy the Fractribution sources into a dedicated working directory
# (the original `WORKDIR .` was a no-op and copied everything into /).
WORKDIR /app
COPY . .

# Install all Python dependencies in a single layer to minimize image size;
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir --upgrade \
    functions-framework \
    google-api-python-client \
    oauth2client \
    google-auth \
    google-auth-httplib2 \
    google-cloud-bigquery \
    google-cloud-logging \
    google-cloud-pubsub \
    jinja2

# Exec form so the python process is PID 1 and receives signals
# (e.g. SIGTERM on container stop) directly, not via /bin/sh.
ENTRYPOINT ["python", "start.py"]
--------------------------------------------------------------------------------
/py/README.md:
--------------------------------------------------------------------------------
1 | ## Setting up Fractribution:
2 |
3 | ### Step 1: Extracting customer conversions:
4 |
5 | By default, `templates/extract_conversions.sql` runs over the GA360 BigQuery
6 | table using the conversion definition in `templates/conversion_definition.sql`.
7 | In most cases, you should only have to edit
8 | `templates/conversion_definition.sql`. However, if your conversions are stored
9 | outside the GA360 BigQuery table, use the instructions in
10 | `templates/extract_conversions.sql` to replace it with a custom script.
11 |
12 | The conversion window is the period of time that Fractribution will look to
13 | extract conversions. The window is specified by passing in the following flags:
14 |
15 | * ***`conversion_window_end_date`***: `'YYYY-MM-DD'` date in UTC time.
16 | Conversions up to midnight on this date are included.
17 | * ***`conversion_window_end_today_offset_days`***: Sets the
18 | `conversion_window_end_date` as an offset from today's date. This is an
19 | alternative to `conversion_window_end_date` used in regular scheduled runs
20 | of fractribution.
21 | * ***`conversion_window_length`***: Number of days in the conversion window,
22 | leading up to the end date of the conversion window.
23 |
24 | ### Step 2: Defining marketing channels (e.g. `Paid_Search_Brand`, `Email`, etc):
25 |
26 | Marketing channels are defined in `templates/channel_definitions.sql`. If the
27 | default set are not suitable, use the instructions in the file to write your
28 | own definitions.
29 |
30 | Fractribution uses ad spend by channel to compute return on ad spend (ROAS).
31 | Overwrite `templates/extract_channel_spend_data.sql` to enable ROAS reporting.
32 | Note that the ad spend period should include both the conversion window, and
33 | the preceding `path_lookback_days`.
34 |
35 | ### Step 3: Support for cross-device tracking:
36 |
37 | GA360 tracks users at the device level with a `fullVisitorId`. If a user logs
38 | into your site, you can send a custom `userId` to GA360, which is then
39 | associated with the `fullVisitorId`. If two `fullVisitorId`s map to the same
40 | `userId`, it can mean the user is logging in from two different devices.
41 |
42 | There are two ways to send a `userId` to GA360. First, there is a top-level
43 | `userId` field added specifically for this purpose. However, this userId may not
44 | be present in the GA360 BigQuery table, even if it is set in GA360. So the
45 | second way is to use a custom dimension (either hit-level or top-level).
46 |
47 | Fractribution supports cross-device tracking by maintaining its own mapping
48 | table of `fullVisitorId` to `userId`. Whenever a `userId` is present,
49 | Fractribution will use that to group GA360 sessions. Otherwise, it falls back to
50 | using the `fullVisitorId`.
51 |
52 | The main flags for supporting cross-device tracking are:
53 |
54 | * ***`update_fullvisitorid_userid_map`***: `True` to update the internal map
55 | from `fullVisitorId` to `userId`, and `False` otherwise. Default: `True`.
56 | * ***`userid_ga_custom_dimension_index`***: If you use a custom dimension for
57 | storing the `userId` in Google Analytics, set the index here. Fractribution
58 | will automatically look for the top-level `userId` field, even if this index
59 | is defined.
60 | * ***`userid_ga_hits_custom_dimension_index`***: If you use a hit-level custom
61 | dimension for storing the `userId` in Google Analytics, set the index here.
62 | Fractribution will automatically look for the top-level `userId` field, even
63 | if this index is defined.
64 |
65 | If you maintain your own mapping of `fullVisitorId` to `userId`, overwrite the
66 | script `templates/extract_fullvisitorid_userid_map.sql`.
67 |
68 | ## Fractribution Parameters:
69 |
70 | * ***`project_id`***: Google Cloud `project_id` to run Fractribution inside.
71 | * ***`dataset`***: BigQuery dataset to write the Fractribution output.
72 | * ***`region`***: Region to create the dataset if it does not exist (see
73 | https://cloud.google.com/bigquery/docs/locations).
74 | * ***`ga_sessions_table`***: Name of the GA360 BigQuery table in the format
75 | `..
  `<PROJECT>.<DATASET>.ga_sessions_*`.
76 | * ***`hostnames`***: Comma separated list of hostnames. Restrict user sessions
77 | to this set of hostnames (Default: no restriction).
78 | * ***`conversion_window_end_date`***: `'YYYY-MM-DD'` date in UTC time to define
79 | the end of the reporting period (inclusive) to look for a conversion.
80 | * ***`conversion_window_end_today_offset_days`***: Set the conversion window end
81 | date to this many days before today. This is an alternative to
  `conversion_window_end_date` used in regular scheduled runs of Fractribution.
83 | * ***`conversion_window_length`***: Number of days in the conversion window.
84 | * ***`path_lookback_days`***: Number of days in a user\'s path to
85 | (non)conversion. Recommended values: `30`, `14`, or `7`.
86 | * ***`path_lookback_steps`***: Limit the number of steps / marketing channels in
87 | a user's path to (non)conversion to the most recent path_lookback_steps.
88 | (Default: no restriction).
89 | * ***`update_fullvisitorid_userid_map`***: `True` to update the internal map
90 | from `fullVisitorId` to `userId`, and `False` otherwise. Default: `True`.
91 | * ***`userid_ga_custom_dimension_index`***: If you use a custom dimension for
92 | storing the `userId` in Google Analytics, set the index here. Fractribution
93 | will automatically look for the top-level `userId` field, even if this index
94 | is defined.
95 | * ***`userid_ga_hits_custom_dimension_index`***: If you use a hit-level custom
96 | dimension for storing the `userId` in Google Analytics, set the index here.
97 | Fractribution will automatically look for the top-level `userId` field, even
98 | if this index is defined.
99 | * ***`path_transform`***: Fractribution extracts a path of marketing channels
100 | for each user. The path transform will change this path to improve
101 | matching and performance of the Fractribution algorithm on sparse data. For
102 | example, if a user has several Direct to website visits, this can be
103 | compressed to one representative Direct to website visit. There are 4
104 | transforms to choose from. Given a path of channels
105 | `(D, A, B, B, C, D, C, C)`, the transforms are:
106 |
107 | * ***`unique`***: (identity transform): yielding `(D, A, B, B, C, D, C, C)`,
108 | * ***`exposure`***: (collapse sequential repeats, default option):
109 | yielding `(D, A, B, C, D, C)`,
110 | * ***`first`***: (remove repeats): yielding `(D, A, B, C)`,
111 | * ***`frequency`***: (remove repeats, but keep a count): yielding
112 | `(D(2), A(1), B(2), C(3))`
113 |
114 | Path transforms can now be chained together and are executed in the order
115 | specified. To specify multiple transforms from the command line, use one
116 | separate --path_transform for each transform. Otherwise, pass in a list of
117 | strings, one per transform. Additional options for transform are:
118 |
119 | * ***`trimLongPath(n)`***:
120 | * ***`removeIfNotAll(channel)`***:
121 | * ***`removeIfLastAndNotAll(channel)`***:
122 |
123 | Both removeIfNotAll and removeIfLastAndNotAll are typically used to downweight
124 | the contribution of the 'Direct' / 'Direct-to-site' channel.
125 | * ***`attribution_model`***: Which attribution model to use. Models include:
126 | `shapley`, `first_touch`, `last_touch`, `position_based` and `linear`.
127 | (Default: `shapley`).
128 | * ***`templates_dir`***: Optional directory containing custom SQL templates.
129 | When loading a template, this directory is checked first before the default
130 | ./templates directory.
131 | * ***`channel_definitions_sql`***: Optional argument to override the default
132 | filename of the SQL template for mapping channel definitions to channel names.
133 | * ***`conversion_definition_sql`***: Optional argument to override the default
134 | filename of the SQL template that defines a conversion.
135 | * ***`extract_conversions_sql`***: Optional argument to override the default
136 | filename of the SQL template for extracting all conversions.
137 |
138 | ## Tutorial: Running Fractribution on the Google Merchandise Store.
139 |
140 | We will run Fractribution over the
141 | [publicly-available GA360 dataset](https://support.google.com/analytics/answer/7586738?hl=en)
142 | for the [Google Merchandise store](https://googlemerchandisestore.com/), a real
143 | ecommerce store that sells Google-branded merchandise. You can view the
144 | obfuscated data on BigQuery
145 | [here](https://bigquery.cloud.google.com/table/bigquery-public-data:google_analytics_sample.ga_sessions_20170801).
146 |
147 | The easiest way to run Fractribution is manually from the command line. This
148 | works well for experimenting (e.g. with new conversion or channel definitions)
149 | and debugging. If you want to setup Fractribution to run on a schedule though,
150 | please see the following section on [Deploying Fractribution](#deploying-fractribution).
151 |
152 |
153 | To run from the command line, begin by downloading the fractribution folder to
154 | your local computer and then change directory into `fractribution/py`
155 |
156 | Next, select values for the following:
157 |
158 | * ***``***: GCP project in which to run Fractribution
159 | * ***``***: BigQuery dataset name to store the Fractribution output.
160 | * ***``***:
161 | [Region name](https://cloud.google.com/bigquery/docs/locations) in which to
162 | create ***``*** if it doesn't already exist. E.g. us-central1
163 |
164 |
165 |
166 | Then run the following command to authenticate with GCP:
167 |
168 | ```export GOOGLE_APPLICATION_CREDENTIALS=```
169 |
170 | Finally, run Fractribution with the following command:
171 |
172 | ```
173 | python3 main.py \
174 | --project_id= \
--dataset= \
176 | --region= \
177 | --ga_sessions_table=bigquery-public-data.google_analytics_sample.ga_sessions_* \
178 | --conversion_window_end_date=2017-08-01 \
179 | --conversion_window_length=30 \
180 | --path_lookback_days=30 \
181 | --path_transform=exposure \
182 | --attribution_model=shapley
183 | ```
184 |
185 | Once the command finishes, go to your BigQuery ***``*** and look at the
186 | results, including the final report table.
187 |
188 |
189 | ### Deploying Fractribution on GCP: Cloud Functions vs VM
190 |
191 | We recommend deploying Fractribution via Cloud Functions. Setup and maintenance
192 | are easier, and because Cloud Functions are serverless, you only pay for what
193 | you use. The main downsides are that Cloud Functions are limited to 2GB of RAM
194 | and 9 minutes of runtime. If Cloud Functions run out of memory or time on your
195 | data, switch to the VM approach, which allows you to select a compute engine
196 | with much higher memory and no time limits.
197 |
198 | Either way, as for the command line approach above, please select:
199 |
200 | * ***``***: GCP project in which to run Fractribution
201 | * ***``***: BigQuery dataset name to store the Fractribution output.
202 | * ***``***:
203 | [Region name](https://cloud.google.com/bigquery/docs/locations) in which to
204 | create ***``*** if it doesn't already exist. E.g. us-central1
205 |
206 | ### Approach 1: Running Python Cloud Functions (recommended)
207 |
208 | #### Setup
209 | [Install gcloud SDK](https://cloud.google.com/sdk/install)
210 |
211 | ```
212 | gcloud auth login && gcloud config set project
213 | ```
214 |
215 | Download the fractribution folder to your local computer and change directory
216 | into `fractribution/py`. We will use the default definition of customer
217 | conversion and revenue. We will also use the default channel definitions.
218 | However, to make the report more interesting, in
219 | `templates/extract_channel_spend_data.sql`, comment out the default SQL, and
220 | uncomment the sample uniform spend data instead.
221 |
222 | #### Deploying and Running the Fractribution Cloud Function
223 |
224 | For this tutorial, we will create a Cloud Function called `FractributionTest`.
225 |
226 | ```
227 | gcloud functions deploy FractributionTest \
228 | --runtime python37 \
229 | --region \
230 | --entry-point main \
231 | --trigger-event google.pubsub.topic.publish \
232 | --trigger-resource FractributionTestPubSub \
233 | --timeout 540s \
234 | --memory 2GB
235 | ```
236 |
The `trigger-*` flags above set up how to run `FractributionTest`. The
`trigger-resource` flag creates a PubSub topic called `FractributionTestPubSub`.
239 | The Cloud Function executes when it receives a message on this topic.
240 | To publish a message and trigger `FractributionTest`, use the following command:
241 |
242 | ```
243 | gcloud pubsub topics publish FractributionTestPubSub --message '{
244 | "project_id":"",
245 | "dataset":"",
246 | "region":"",
247 | "ga_sessions_table":"bigquery-public-data.google_analytics_sample.ga_sessions_*",
248 | "conversion_window_end_date":"2017-08-01",
249 | "conversion_window_length":30,
250 | "path_lookback_days":"30",
251 | "path_transform":"exposure",
252 | "attribution_model":"shapley"
253 | }'
254 | ```
255 |
256 | You can now go to your BigQuery `` to view the output of the first
257 | stage of fractribution. Note that the output tables all have the same suffix,
258 | which is the ``. This helps separate regular
259 | scheduled runs of Fractribution over time. We recommend looking at:
260 |
261 | * ***`report_table`***: Channel-level summary of attributed conversions,
262 | revenue, spend and ROAS.
* ***`path_summary_table`***: For each transformed path, total number of
264 | conversions, non-conversions, revenue, and channel-level fractional
265 | attribution values out of 1.
266 | * ***`channel_counts_table`***: Number of marketing events, aggregated by
267 | `channel`, `campaign`, `source` and `medium`
268 |
269 | #### Scheduling Fractribution to run end-to-end on the latest data.
270 |
271 | 1. Decide how often you want to run Fractribution.
272 |
273 | `CRON_SCHEDULE` = Schedule on which Fractribution will be executed in
274 | cron-unix format.
275 |
276 | Example: `15 1 1 * *` - Run every first day of the month at 1:15AM.
277 |
278 | 1. Use ***`conversion_window_end_today_offset_days`*** instead of the fixed
279 | ***`conversion_window_end_date`*** in the parameters. Suggested values are
280 | `1` or `2`, to give enough time for the Google Analytics data tables to be
281 | fully ingested into BigQuery.
282 |
283 | 1. Create a cron job to run Fractribution using Cloud Scheduler.
284 |
285 | ```
286 | gcloud scheduler jobs create pubsub Fractribution --schedule
287 | "" --topic FractributionTest --message-body '{
288 | "project_id":"",
289 | "dataset":"",
290 | "region":"",
291 | "ga_sessions_table":".ga_sessions_*",
292 | "conversion_window_end_today_offset_days":1,
293 | "conversion_window_length":30,
294 |
295 | }'
296 | ```
297 |
298 | #### Debugging Fractribution
299 |
300 | If you need to debug changes you've made, it is much faster to do locally,
301 | rather than going through the slower process of uploading several versions of
302 | the Cloud Function for each small change. The easiest way to debug is to use the
303 | standalone command-line version of Fractribution, as
304 | [described above](#running-fractribution). However, Cloud Functions do have a
305 | local-execution framework called `functions-framework`, which is described
306 | below:
307 |
308 | First, follow
309 | [these instructions](https://cloud.google.com/python/setup?hl=en#installing_python)
310 | for installing python and running a virtual environment to sandbox dependencies.
311 | In particular, from inside the `fractribution/py` directory:
312 |
313 | ```
314 | python3 -m venv venv
315 | source venv/bin/activate
316 | pip3 install -r requirements.txt
317 | export GOOGLE_APPLICATION_CREDENTIALS=
318 | functions-framework --target main --signature-type=event --debug
319 | ```
320 |
321 | The Fractribution cloud function is now running in a local web server. Instead
322 | of using PubSub, we `POST` the parameters to the Cloud Function using
323 | ***`curl`***. This means we have to encode the parameters in base64, e.g.
324 |
325 | Linux:
326 | ```
echo -n '{"project_id":"", ...}' | base64
338 | ```
339 |
340 | Copy the encoded parameters text into the `curl` command below:
341 |
342 | ```
343 | export GOOGLE_APPLICATION_CREDENTIALS=
344 | curl -d '{"data": {"data": ""}}' \-X POST -H "Content-Type: application/json" http://0.0.0.0:8080
345 | ```
346 |
347 | That should capture both print statements and debug traces when things go wrong.
348 |
349 | ### Approach 2: Deploy Fractribution Docker Image on GCP VM Instance
350 |
351 | 1. Variables:
352 |
353 | 1. `` - Region of the VM Instance
354 | 1. `` - Zone of the VM Instance
355 | 1. `` - Service Account for Fractribution.
356 | It should have the following roles:
357 | * BigQuery Data Editor
358 | * BigQuery Job User
359 | * Compute Instance Admin (beta)
360 | * Logs Writer
361 | * Storage Object Viewer
362 | 1. `` - Schedule on which Fractribution will be
363 | executed in cron-unix format. Example: `"15 1 1 * *"` - Run every first
364 | day of the month at 1:15AM.
365 | 1. `` - Fractribution parameters for data and model
366 | in JSON format. Example value below:
367 |
368 | ```
369 | '{"project_id":"",
370 | "dataset":"",
371 | "region":"",
372 | "ga_sessions_table":"bigquery-public-data.google_analytics_sample.ga_sessions_*",
373 | "conversion_window_end_date":"2017-08-01",
374 | "conversion_window_length":30,
375 | "path_lookback_days":30,
376 | "path_transform":"exposure",
377 | "attribution_model":"shapley"}'
378 | ```
379 |
380 | 1. Create a docker image. From the Fractribution code directory:
381 |
382 | ```bash
383 | gcloud builds submit --tag gcr.io//
384 | ```
385 |
386 | 1. Set up the Compute Engine Instance
387 |
388 | 1. Go to the VM instances page
389 | https://console.cloud.google.com/compute/instances
390 | 1. Click Create instance.
391 | 1. Set the _Name_.
392 | 1. Click Add label. Enter `env` for _Key_ and `fractribution` for _Value_.
1. For _Region_, select ``.
394 | 1. For _Zone_ select ``.
395 | 1. Select Deploy a container image.
396 | 1. Specify the container image (`gcr.io//`)
397 | created in Step #1.
398 | 1. Expand Advanced container options section.
399 | 1. Under Environment variables, click Add variable. Enter
400 | `fractribution_param` for _NAME_ and `` for
401 | _VALUE_.
402 | 1. Under Identity and API access, for Service account, select
403 | ``.
404 | 1. Click Create at the bottom of the page.
405 |
406 | 1. Set up Cloud Function to start a VM instance. (Reference)
407 |
408 | 1. Go to the Cloud Functions page in the Cloud Console.
409 | 1. Click Create Function.
410 | 1. Set the Name to startInstancePubSub.
411 | 1. Leave Memory allocated at its default value.
412 | 1. For Trigger, select Cloud Pub/Sub.
413 | 1. For Topic, select Create new topic....
414 | 1. A New pub/sub topic dialog box should appear.
415 | 1. Under Name, enter start-instance-event.
416 | 1. Click Create to finish the dialog box.
417 | 1. For Runtime, select Node.js 10.
418 | 1. Above the code text block, select the index.js tab.
419 | 1. Replace the starter code with the following code:
420 |
421 | ```
422 | const Compute = require('@google-cloud/compute');
423 | const compute = new Compute();
424 | /**
425 | * Starts Compute Engine instances.
426 | *
427 | * Expects a PubSub message with JSON-formatted event data containing the
428 | * following attributes:
429 | * zone - the GCP zone the instances are located in.
430 | * label - the label of instances to start.
431 | *
432 | * @param {!object} event Cloud Function PubSub message event.
433 | * @param {!object} callback Cloud Function PubSub callback indicating
434 | * completion.
435 | */
436 | exports.startInstancePubSub = async (event, context, callback) => {
437 | try {
438 | const payload = _validatePayload(
439 | JSON.parse(Buffer.from(event.data, 'base64').toString())
440 | );
441 | const options = {filter: `labels.${payload.label}`};
442 | const [vms] = await compute.getVMs(options);
443 | await Promise.all(
444 | vms.map(async (instance) => {
445 | if (payload.zone === instance.zone.id) {
446 | const [operation] = await compute
447 | .zone(payload.zone)
448 | .vm(instance.name)
449 | .start();
450 |
451 | // Operation pending
452 | return operation.promise();
453 | }
454 | })
455 | );
456 |
457 | // Operation complete. Instance successfully started.
458 | const message = `Successfully started instance(s)`;
459 | console.log(message);
460 | callback(null, message);
461 | } catch (err) {
462 | console.log(err);
463 | callback(err);
464 | }
465 | };
466 |
467 | /**
468 | * Validates that a request payload contains the expected fields.
469 | *
470 | * @param {!object} payload the request payload to validate.
471 | * @return {!object} the payload object.
472 | */
473 | const _validatePayload = (payload) => {
474 | if (!payload.zone) {
475 | throw new Error(`Attribute 'zone' missing from payload`);
476 | } else if (!payload.label) {
477 | throw new Error(`Attribute 'label' missing from payload`);
478 | }
479 | return payload;
480 | };
481 | ```
482 |
483 | 1. Above the code text block, select the package.json tab.
484 | 1. Replace the starter code with the following code:
485 |
486 | ```
487 | {
488 | "name": "cloud-functions-schedule-instance",
489 | "version": "0.1.0",
490 | "private": true,
491 | "license": "Apache-2.0",
492 | "author": "Google LLC",
493 | "repository": {
494 | "type": "git",
495 | "url": "https://github.com/GoogleCloudPlatform/nodejs-docs-samples.git"
496 | },
497 | "engines": {
498 | "node": ">=8.0.0"
499 | },
500 | "dependencies": {
501 | "@google-cloud/compute": "^1.0.0"
502 | }
503 | }
504 | ```
505 |
506 | 1. For Function to execute, enter `startInstancePubSub`.
507 | 1. Click Create.
508 |
509 | 1. Set up Cloud Scheduler to trigger Pub/Sub. (Reference)
510 |
511 | 1. Go to the Cloud Scheduler page in the Cloud Console.
512 | 1. Click Create Job.
513 | 1. Set the Name to `startup-fractribution-instance`.
514 | 1. For Frequency, enter ``.
515 | 1. For Timezone, select your desired country and timezone.
516 | 1. For Target, select Pub/Sub.
517 | 1. For Topic, enter `start-instance-event`.
518 | 1. For Payload, enter the following:
519 | `{"zone":"","label":"env=fractribution"}`
520 | 1. Click Create.
521 |
522 | Disclaimer: This is not an officially supported Google product.
523 |
--------------------------------------------------------------------------------
/py/fractribution.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
# Copyright 2022 Google LLC.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Library for computing fractional attribution."""
17 |
18 | import io
19 | import json
20 | import re
21 | from typing import Iterable, List, Mapping, Tuple
22 | from google.cloud import bigquery
23 |
# Default channel name assigned when a marketing event matches no channel
# definition (see its use as the fallback attribution key below).
UNMATCHED_CHANNEL = 'Unmatched_Channel'
26 |
27 |
class _PathSummary(object):
  """Stores conversion and attribution information.

  To save space, the path itself is not stored here, as it is already stored
  as the key of the _path_tuple_to_summary dict in Fractribution.
  """

  def __init__(self, conversions: int, non_conversions: int, revenue: float):
    """Initializes the per-path counts.

    Args:
      conversions: Number of conversions with this transformed path.
      non_conversions: Number of non-conversions with this transformed path.
      revenue: Total revenue for this path (may be None/falsy; downstream
        code treats missing revenue as 0.0).
    """
    self.conversions = conversions
    self.non_conversions = non_conversions
    self.revenue = revenue
    # Mapping from channel name to fractional attribution value; populated
    # later by one of the Fractribution.run_*_attribution methods.
    self.channel_to_attribution = {}
40 |
41 |
class Fractribution(object):
  """Runs Fractribution on a set of marketing paths to (non-)conversion."""

  @classmethod
  def _get_path_string(cls, path_tuple: Iterable[str]) -> str:
    """Returns the channels in path_tuple joined into a ' > ' path string."""
    return ' > '.join(path_tuple)

  def __init__(self, query_job: bigquery.job.QueryJob):
    """Loads (path_str, conversions, non_conversions, revenue) from query_job.

    Args:
      query_job: QueryJob of (path_str, conversions, non_conversions, revenue).
    """
    self._path_tuple_to_summary = {}
    for (path_str, conversions, non_conversions, revenue) in query_job:
      path_tuple = ()
      if path_str:
        path_tuple = tuple(path_str.split(' > '))
      if path_tuple not in self._path_tuple_to_summary:
        self._path_tuple_to_summary[path_tuple] = _PathSummary(
            conversions, non_conversions, revenue)
      else:
        # Aggregate all fields of duplicate path rows. Note: revenue was
        # previously dropped here, undercounting revenue for any path that
        # appeared in more than one input row.
        path_summary = self._path_tuple_to_summary[path_tuple]
        path_summary.conversions += conversions
        path_summary.non_conversions += non_conversions
        if revenue is not None:
          path_summary.revenue = (path_summary.revenue or 0.0) + revenue

  def _get_conversion_probability(
      self, path_tuple: Tuple[str, ...]) -> float:
    """Returns path_tuple conversion/(conversion+non_conversion) probability.

    Args:
      path_tuple: Tuple of channel names in the path.

    Returns:
      Conversion probability of customers with this path. 0.0 if the path is
      unknown or has no (non-)conversions.
    """

    if path_tuple not in self._path_tuple_to_summary:
      return 0.0
    path_summary = self._path_tuple_to_summary[path_tuple]
    count = path_summary.conversions + path_summary.non_conversions
    if not count:
      return 0.0
    return path_summary.conversions / count

  def _get_counterfactual_marginal_contributions(
      self, path_tuple: Tuple[str, ...]) -> List[float]:
    """Returns the marginal contribution of each channel in the path.

    Args:
      path_tuple: Tuple of channel names in the path.

    Returns:
      List of marginal contribution values, one for each channel in path_tuple.
    """
    if not path_tuple:
      return []
    marginal_contributions = [0] * len(path_tuple)
    path_conversion_probability = self._get_conversion_probability(path_tuple)
    # If the path contains a single channel, it gets 100% of the contribution.
    if len(path_tuple) == 1:
      marginal_contributions[0] = path_conversion_probability
    else:
      # Otherwise, compute the counterfactual marginal contributions by
      # comparing against the path with channel i removed.
      for i in range(len(path_tuple)):
        counterfactual_tuple = path_tuple[:i] + path_tuple[i+1:]
        raw_marginal_contribution = (
            path_conversion_probability -
            self._get_conversion_probability(counterfactual_tuple))
        # Avoid negative contributions by flooring to 0.
        marginal_contributions[i] = max(raw_marginal_contribution, 0)
    return marginal_contributions

  def run_fractribution(self, attribution_model: str) -> None:
    """Runs Fractribution with the given attribution_model.

    Side-effect: Updates channel_to_attribution dicts in _path_tuple_to_summary.

    Args:
      attribution_model: Must be a key in ATTRIBUTION_MODELS, otherwise a
        KeyError is raised.
    """
    self.ATTRIBUTION_MODELS[attribution_model](self)

  def run_shapley_attribution(self) -> None:
    """Compute fractional attribution values for all given paths.

    Side-effect: Updates channel_to_attribution dicts in _path_tuple_to_summary.
    """
    for path_tuple, path_summary in self._path_tuple_to_summary.items():
      # Ignore empty paths, which can happen when there is a conversion, but
      # no matching marketing channel events. Also ignore paths with no
      # conversions, since there is no attribution to make.
      if not path_tuple or not path_summary.conversions:
        continue
      path_summary.channel_to_attribution = {}
      marginal_contributions = self._get_counterfactual_marginal_contributions(
          path_tuple)
      sum_marginal_contributions = sum(marginal_contributions)
      if sum_marginal_contributions:
        # Normalize so the contributions sum to 1.
        marginal_contributions = [
            marginal_contribution / sum_marginal_contributions
            for marginal_contribution in marginal_contributions]
      # Use last touch attribution if no channel has a marginal_contribution.
      if sum_marginal_contributions == 0:
        marginal_contributions[-1] = 1
      # Aggregate the marginal contributions by channel, as channels can occur
      # more than once in the path.
      for i, channel in enumerate(path_tuple):
        path_summary.channel_to_attribution[channel] = (
            marginal_contributions[i]
            + path_summary.channel_to_attribution.get(channel, 0.0))

  def run_first_touch_attribution(self) -> None:
    """Assigns 100% attribution to the first channel in each path.

    Side-effect: Updates channel_to_attribution dicts in _path_tuple_to_summary.
    """
    for path_tuple, path_summary in self._path_tuple_to_summary.items():
      path_summary.channel_to_attribution = {}
      if not path_tuple:
        continue
      for channel in path_tuple:
        path_summary.channel_to_attribution[channel] = 0.0
      path_summary.channel_to_attribution[path_tuple[0]] = 1

  def run_last_touch_attribution(self) -> None:
    """Assigns 100% attribution to the last channel in each path.

    Side-effect: Updates channel_to_attribution dicts in _path_tuple_to_summary.
    """
    for path_tuple, path_summary in self._path_tuple_to_summary.items():
      path_summary.channel_to_attribution = {}
      if not path_tuple:
        continue
      for channel in path_tuple:
        path_summary.channel_to_attribution[channel] = 0.0
      path_summary.channel_to_attribution[path_tuple[-1]] = 1

  def run_linear_attribution(self) -> None:
    """Assigns attribution evenly between all channels on the path.

    Side-effect: Updates channel_to_attribution dicts in _path_tuple_to_summary.
    """
    for path_tuple, path_summary in self._path_tuple_to_summary.items():
      path_summary.channel_to_attribution = {}
      if not path_tuple:
        continue
      credit = 1.0 / len(path_tuple)
      for channel in path_tuple:
        path_summary.channel_to_attribution[channel] = (
            path_summary.channel_to_attribution.get(channel, 0.0) + credit)

  def run_position_based_attribution(self) -> None:
    """Assigns attribution using the position based algorithm.

    The first and last channels get 40% of the credit each, with the remaining
    channels getting the leftover 20% distributed evenly.

    Side-effect: Updates channel_to_attribution dicts in _path_tuple_to_summary.
    """
    for path_tuple, path_summary in self._path_tuple_to_summary.items():
      path_summary.channel_to_attribution = {}
      if not path_tuple:
        continue
      path_summary.channel_to_attribution[path_tuple[0]] = 0.4
      path_summary.channel_to_attribution[path_tuple[-1]] = (
          path_summary.channel_to_attribution.get(path_tuple[-1], 0) + 0.4)
      leftover_credit = 0
      middle_path = []
      if len(path_tuple) == 1:
        # All the leftover credit goes to the first and only channel
        leftover_credit = 0.2
        middle_path = path_tuple
      elif len(path_tuple) == 2:
        # The leftover credit is split between the two channels in the path.
        leftover_credit = 0.1
        middle_path = path_tuple
      else:
        # The leftover credit is evenly distributed among the middle channels.
        leftover_credit = 0.2 / (len(path_tuple) - 2)
        middle_path = path_tuple[1:-1]
      for channel in middle_path:
        path_summary.channel_to_attribution[channel] = (
            path_summary.channel_to_attribution.get(channel, 0.0) +
            leftover_credit)

  def normalize_channel_to_attribution_names(self) -> None:
    """Normalizes channel names and aggregates attribution values if necessary.

    Path transforms can also transform channel names to include a count
    related suffix (). This function undoes the transform on the channel
    name by removing the suffix, so that a single channel with two different
    suffixes can be aggregated.

    Side-effect: Updates channel_to_attribution names in _path_tuple_to_summary.
    """
    for path_summary in self._path_tuple_to_summary.values():
      channel_to_attribution = {}
      for channel in path_summary.channel_to_attribution:
        # Strip everything from the first '(' onward, e.g. 'Direct(2' -> 'Direct'.
        normalized_channel = re.sub(r'\(.*', '', channel)
        channel_to_attribution[normalized_channel] = (
            channel_to_attribution.get(normalized_channel, 0) +
            path_summary.channel_to_attribution[channel])
      path_summary.channel_to_attribution = channel_to_attribution

  def _path_summary_to_json_stringio(self) -> io.BytesIO:
    """Returns a BytesIO file with one JSON-encoded _PathSummary per line."""

    default_attribution = {UNMATCHED_CHANNEL: 1.0}
    bytesio = io.BytesIO()
    for path_tuple, path_summary in self._path_tuple_to_summary.items():
      row = {'transformedPath': self._get_path_string(path_tuple),
             'conversions': path_summary.conversions,
             'nonConversions': path_summary.non_conversions,
             'revenue': path_summary.revenue}
      if path_summary.channel_to_attribution:
        row.update(path_summary.channel_to_attribution)
      else:
        # Paths with no attribution get 100% Unmatched_Channel attribution.
        row.update(default_attribution)
      bytesio.write(json.dumps(row).encode('utf-8'))
      bytesio.write('\n'.encode('utf-8'))
    bytesio.flush()
    bytesio.seek(0)
    return bytesio

  def upload_path_summary(
      self, client: bigquery.client.Client, path_summary_table: str) -> None:
    """Uploads the path summary data to the given path_summary_table.

    Args:
      client: BigQuery Client
      path_summary_table: Name of the table to write the path summaries.
    """
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = False
    job = client.load_table_from_file(
        self._path_summary_to_json_stringio(),
        client.get_table(path_summary_table),
        job_config=job_config)
    job.result()  # Waits for table load to complete.

  def _get_channel_to_attribution(self) -> Mapping[str, float]:
    """Returns a mapping from channel to overall conversion attribution.

    Returns:
      Mapping from channel to overall conversion attribution.
    """
    default_attribution = {UNMATCHED_CHANNEL: 1.0}
    overall_channel_to_attribution = {}
    for path_summary in self._path_tuple_to_summary.values():
      channel_to_attribution = path_summary.channel_to_attribution
      if not channel_to_attribution:
        channel_to_attribution = default_attribution
      for channel, attribution in channel_to_attribution.items():
        # Weight each path's fractional attribution by its conversion count.
        overall_channel_to_attribution[channel] = (
            overall_channel_to_attribution.get(channel, 0.0)
            + attribution * path_summary.conversions)
    return overall_channel_to_attribution

  def _get_channel_to_revenue(self) -> Mapping[str, float]:
    """Returns a mapping from channel to overall revenue attribution.

    Returns:
      Mapping from channel to overall revenue attribution.
    """
    default_attribution = {UNMATCHED_CHANNEL: 1.0}
    overall_channel_to_revenue = {}
    for path_summary in self._path_tuple_to_summary.values():
      channel_to_attribution = path_summary.channel_to_attribution
      if not channel_to_attribution:
        channel_to_attribution = default_attribution
      revenue = path_summary.revenue
      if not revenue:
        # Treat missing/None revenue as zero.
        revenue = 0.0
      for channel, attribution in channel_to_attribution.items():
        overall_channel_to_revenue[channel] = (
            overall_channel_to_revenue.get(channel, 0.0)
            + attribution * revenue)
    return overall_channel_to_revenue

  def upload_report_table(
      self,
      client: bigquery.client.Client,
      conversion_window_start_date: str,
      conversion_window_end_date: str,
      report_table: str) -> None:
    """Uploads the channel-level attribution report to the given report_table.

    Args:
      client: BigQuery Client
      conversion_window_start_date: Start date of the report conversion window.
      conversion_window_end_date: End date of the report conversion window.
      report_table: Name of the table to write the report.
    """
    bytesio = io.BytesIO()
    channel_to_attribution = self._get_channel_to_attribution()
    channel_to_revenue = self._get_channel_to_revenue()
    for channel, attribution in channel_to_attribution.items():
      row = {'conversionWindowStartDate': conversion_window_start_date,
             'conversionWindowEndDate': conversion_window_end_date,
             'channel': channel,
             'conversions': attribution,
             'revenue': channel_to_revenue.get(channel, 0.0)
            }
      bytesio.write(json.dumps(row).encode('utf-8'))
      bytesio.write('\n'.encode('utf-8'))
    bytesio.flush()
    bytesio.seek(0)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = True
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = client.load_table_from_file(
        bytesio,
        report_table,
        job_config=job_config)
    job.result()  # Waits for table load to complete.

  # Dispatch table used by run_fractribution(); values are unbound methods.
  ATTRIBUTION_MODELS = {
      'shapley': run_shapley_attribution,
      'first_touch': run_first_touch_attribution,
      'last_touch': run_last_touch_attribution,
      'position_based': run_position_based_attribution,
      'linear': run_linear_attribution
  }
368 |
--------------------------------------------------------------------------------
/py/main.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
# Copyright 2022 Google LLC.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Loads the data into BigQuery needed to run Fractribution."""
17 |
18 | import base64
19 | import datetime
20 | import json
21 | import os
22 | import re
23 | from typing import Any, Dict, List, Mapping, Optional, Tuple
24 | from absl import app
25 | from absl import flags
26 | from google.cloud import bigquery
27 | import jinja2
28 | import fractribution
29 |
30 |
FLAGS = flags.FLAGS

# GCP flags.
flags.DEFINE_string('project_id', None, 'Google Cloud project to run inside.')
flags.DEFINE_string('dataset', None, 'BigQuery dataset to write the output.')
flags.DEFINE_string('region', None,
                    'Region to create the dataset if it does not exist (see '
                    'https://cloud.google.com/bigquery/docs/locations).')

# Google Analytics flags.
flags.DEFINE_string('ga_sessions_table', None,
                    'Name of the GA360 BigQuery table in the format '
                    '`<PROJECT>.<DATASET>.ga_sessions_*`.')
flags.DEFINE_string('hostnames', None,
                    'Comma separated list of hostnames. Restrict user sessions '
                    'to this set of hostnames (Default: no restriction).')

# Model flag
flags.DEFINE_string('attribution_model', 'shapley',
                    'Which Attribution model to use. Models include: shapley, '
                    'first_touch, last_touch, position_based and linear. '
                    '(Default: shapley).')

# Conversion window flags.
flags.DEFINE_integer('conversion_window_length', None,
                     'Number of days in the conversion window.')
flags.DEFINE_string('conversion_window_end_date', None,
                    'Ignore conversions after this YYYY-MM-DD UTC date.')
flags.DEFINE_integer('conversion_window_end_today_offset_days', None,
                     'Set the conversion window end date to this many days '
                     'before today. This is an alternative to '
                     'conversion_window_end_date used in regular scheduled '
                     'runs of fractribution.')

# Path flags
flags.DEFINE_integer('path_lookback_days', None,
                     'Number of days in a user\'s path to (non)conversion. '
                     'Recommended values: 30, 14, or 7.')
flags.DEFINE_integer('path_lookback_steps', None,
                     'Optional limit on the number of steps/marketing-channels '
                     'in a user\'s path to (non)conversion to the most recent '
                     'path_lookback_steps. (Default: no restriction).')
flags.DEFINE_multi_string('path_transform', 'exposure',
                          'Name of the path transform function(s) for changing '
                          'user paths to improve matching and performance on '
                          'sparse data. Note path transforms are executed in '
                          'the order that they are specified. Options: unique, '
                          'exposure, first, frequency, trimLongPath, '
                          'removeIfNotAll and removeIfLastAndNotAll. See the '
                          'README for more details.')

# UserId mapping
flags.DEFINE_boolean('update_fullvisitorid_userid_map', True,
                     'True to update the internal map from fullVisitorId to '
                     'userId, and False otherwise. (Default: True).')
flags.DEFINE_integer('userid_ga_custom_dimension_index', None,
                     'Index of the GA custom dimension storing the non-Google '
                     'userId. If set, a map is created between Google '
                     'fullVisitorIds and userIds. (Default: no index).')
flags.DEFINE_integer('userid_ga_hits_custom_dimension_index', None,
                     'Index of the GA hit-level custom dimension storing the '
                     'non-Google userId. If set, a map is created between '
                     'Google fullVisitorIds and userIds. (Default: no index).')

# Location of SQL templates that can be overridden by the user.
flags.DEFINE_string('templates_dir', None,
                    'Optional directory containing custom SQL templates. When '
                    'loading a template, this directory is checked first '
                    'before the default ./templates directory.')
flags.DEFINE_string('channel_definitions_sql', 'channel_definitions.sql',
                    'SQL template file with the mapping from channel '
                    'definitions to channel names.')
flags.DEFINE_string('conversion_definition_sql', 'conversion_definition.sql',
                    'SQL template file with the definition of a conversion.')
flags.DEFINE_string('extract_conversions_sql', 'extract_conversions.sql',
                    'SQL template file for extracting all conversions.')
108 |
109 | _FULLVISITORID_USERID_MAP_TABLE = 'fullvisitorid_userid_map_table'
110 | _PATHS_TO_CONVERSION_TABLE = 'paths_to_conversion_table'
111 | _PATHS_TO_NON_CONVERSION_TABLE = 'paths_to_non_conversion_table'
112 | _PATH_SUMMARY_TABLE = 'path_summary_table'
113 | _CHANNEL_COUNTS_TABLE = 'channel_counts_table'
114 | _REPORT_TABLE = 'report_table'
115 | _OUTPUT_TABLES = [_FULLVISITORID_USERID_MAP_TABLE, _PATHS_TO_CONVERSION_TABLE,
116 | _PATHS_TO_NON_CONVERSION_TABLE, _PATH_SUMMARY_TABLE,
117 | _CHANNEL_COUNTS_TABLE, _REPORT_TABLE]
118 |
119 | _PATH_TRANSFORMS_MAP = {
120 | 'unique': 'Unique',
121 | 'exposure': 'Exposure',
122 | 'first': 'First',
123 | 'frequency': 'Frequency',
124 | 'trimLongPath': 'TrimLongPath',
125 | 'removeIfNotAll': 'RemoveIfNotAll',
126 | 'removeIfLastAndNotAll': 'RemoveIfLastAndNotAll'
127 | }
128 |
129 | VALID_CHANNEL_NAME_PATTERN = re.compile(r'^[a-zA-Z_]\w+$', re.ASCII)
130 |
131 |
def _is_valid_column_name(column_name: str) -> bool:
  """Returns True if the column_name is a valid BigQuery column name."""
  # Names longer than 300 characters are rejected outright.
  if len(column_name) > 300:
    return False
  return VALID_CHANNEL_NAME_PATTERN.match(column_name) is not None
136 |
137 |
138 | def _strip_sql(sql: str) -> str:
139 | """Returns a copy of sql with empty lines and -- comments stripped.
140 |
141 | Args:
142 | sql: A SQL string.
143 | Returns:
144 | A copy of sql with empty lines and -- comments removed.
145 | """
146 | lines = []
147 | for line in sql.split('\n'):
148 | line = re.sub(r'\s*--.*', '', line)
149 | if line.strip():
150 | lines.append(line)
151 | return '\n'.join(lines)
152 |
153 |
154 | def _get_param_or_die(input_params: Mapping[str, Any], param: str) -> Any:
155 | """Returns value of param. Dies with user-formatted message if not defined.
156 |
157 | Args:
158 | input_params: Mapping from input parameter names to values.
159 | param: Name of the param to get the value of.
160 | Returns:
161 | Value of the given param.
162 | Raises:
163 | ValueError: User formatted message on error.
164 | """
165 | value = input_params.get(param, None)
166 | if not value:
167 | raise ValueError('Missing parameter: %s' % param)
168 | return value
169 |
170 |
def parse_int_param(input_params: Mapping[str, Any], param: str,
                    lower_bound: Optional[int] = None,
                    upper_bound: Optional[int] = None) -> Any:
  """Returns int value of param. Dies with user-formatted message on error.

  Args:
    input_params: Mapping from input parameter names to values.
    param: Name of the param to get the value of.
    lower_bound: If not None, the value must be at least lower_bound.
    upper_bound: If not None, the value must be at most upper_bound.
  Returns:
    Integer value of the given param.
  Raises:
    ValueError: User formatted message on error.
  """
  value = _get_param_or_die(input_params, param)
  try:
    int_value = int(value)
  except ValueError:
    raise ValueError('Parameter %s must be an int' % param)
  # Compare bounds against None explicitly: a bound of 0 is falsy, but it is
  # still a bound (the call sites pass lower_bound=0).
  if lower_bound is not None and int_value < lower_bound:
    raise ValueError('Parameter %s must be at least %i' % (param, lower_bound))
  if upper_bound is not None and int_value > upper_bound:
    raise ValueError('Parameter %s must be at most %i' % (param, upper_bound))
  return int_value
196 |
197 |
198 | def _get_table_name(
199 | project: str, dataset: str, table: str, date_suffix: str) -> str:
200 | """Returns the name of the table in BigQuery dotted format."""
201 | return '{}.{}.{}_{}'.format(project, dataset, table, date_suffix)
202 |
203 |
def _get_output_table_ids(
    project_id: str, dataset: str, date_suffix: str) -> Mapping[str, str]:
  """Returns mapping from output table names to full BigQuery table ids.

  Args:
    project_id: Google Cloud Platform project id.
    dataset: Id of the dataset inside project_id to write the output tables.
    date_suffix: date_suffix string to append to the end of the tablenames.

  Returns:
    Mapping of table names to full BigQuery table ids.
    Format of returned id: <project_id>.<dataset>.<table>_<date_suffix>.
  """
  return {
      table: _get_table_name(project_id, dataset, table, date_suffix)
      for table in _OUTPUT_TABLES
  }
221 |
222 |
def _get_conversion_window_date_params(
    input_params: Mapping[str, Any]) -> Mapping[str, Any]:
  """Checks, transforms and returns conversion_window_date input_params.

  Args:
    input_params: Mapping from input parameter names to values.
  Returns:
    Mapping from conversion window date parameters to values.
  Raises:
    ValueError: User formatted message on error.
  """
  window_length = parse_int_param(input_params, 'conversion_window_length', 1)
  use_end_date = input_params.get('conversion_window_end_date') is not None
  use_offset = input_params.get(
      'conversion_window_end_today_offset_days') is not None
  # Exactly one of the two ways of specifying the window end must be given.
  if use_end_date == use_offset:
    raise ValueError('Specify either conversion_window_end_date or '
                     'conversion_window_end_today_offset_days')
  # Resolve the conversion window end date.
  if use_offset:
    offset_days = parse_int_param(
        input_params, 'conversion_window_end_today_offset_days', 0)
    end_date = datetime.date.today() - datetime.timedelta(days=offset_days)
  else:
    end_date = datetime.datetime.strptime(
        _get_param_or_die(input_params, 'conversion_window_end_date'),
        '%Y-%m-%d').date()
    if end_date > datetime.date.today():
      raise ValueError('conversion_window_end_date is in the future.')
  # The window is inclusive of both endpoints, hence the (length - 1) offset.
  start_date = end_date - datetime.timedelta(days=window_length - 1)
  return {
      'conversion_window_length': window_length,
      'conversion_window_end_date': end_date.isoformat(),
      'conversion_window_start_date': start_date.isoformat(),
  }
265 |
266 |
def _get_path_lookback_params(
    input_params: Mapping[str, Any]) -> Mapping[str, Any]:
  """Checks, transforms and returns path_lookback input_params.

  Args:
    input_params: Mapping from input parameter names to values.
  Returns:
    Mapping from path_lookback parameters to values.
  Raises:
    ValueError: User formatted message on error.
  """
  lookback_days = parse_int_param(input_params, 'path_lookback_days', 1)
  # path_lookback_steps is optional; 0 means "no step limit".
  if input_params.get('path_lookback_steps') is None:
    lookback_steps = 0
  else:
    lookback_steps = parse_int_param(input_params, 'path_lookback_steps', 1)
  return {
      'path_lookback_days': lookback_days,
      'path_lookback_steps': lookback_steps,
  }
287 |
288 |
def parse_path_transforms(
    path_transforms_arg: List[str]) -> List[Tuple[str, Optional[str]]]:
  """Parses the given list of path transform strings from the command line.

  Args:
    path_transforms_arg: List of path transforms. Each path transform must be a
                         name from _PATH_TRANSFORMS_MAP. The path transform
                         string can include arguments surrounded by () if
                         needed.

  Returns:
    List of (path_transform_name, arg_str) pairs, where the path_transform_name
    is from templates/path_transforms.sql, and the arg_str is empty or includes
    the full argument string for the path transform, with no surrounding
    parentheses.
  Raises:
    ValueError: User formatted message on error if a path_transform is invalid.
  """
  path_transforms = []
  for path_transform in path_transforms_arg:
    # Accept `name` or `name(args)`; group 2 captures the args without parens.
    match = re.match(r'^(\w+)(?:\((.*)\))?$', path_transform)
    if match is None:
      # Use %-formatting: passing extra args to ValueError builds a tuple of
      # exception args instead of one readable message.
      raise ValueError('Unable to parse path_transform: %s' % path_transform)
    path_transform_name, arg_str = match.groups()
    if path_transform_name not in _PATH_TRANSFORMS_MAP:
      raise ValueError(
          'Unknown path_transform %s. Use one of: %s' % (
              path_transform_name, ', '.join(_PATH_TRANSFORMS_MAP.keys())))
    path_transforms.append(
        (_PATH_TRANSFORMS_MAP[path_transform_name], arg_str))
  return path_transforms
319 |
320 |
def _extract_channels(
    client: bigquery.client.Client, params: Mapping[str, Any]) -> List[str]:
  """Returns the list of channel names by running extract_channels.sql.

  Args:
    client: BigQuery client.
    params: Mapping of template parameter names to values.
  Returns:
    List of channel names.
  Raises:
    ValueError: User-formatted error if channel is not a valid BigQuery column.
  """
  extract_channels_sql = params['jinja_env'].get_template(
      'extract_channels.sql').render(params)
  channels = [
      row.channel for row in client.query(extract_channels_sql).result()]
  # Always include the catch-all channel so later queries that generate one
  # column per channel have a bucket for unmatched traffic.
  if fractribution.UNMATCHED_CHANNEL not in channels:
    channels.append(fractribution.UNMATCHED_CHANNEL)
  for channel in channels:
    if not _is_valid_column_name(channel):
      # Use %-formatting: passing extra args to ValueError builds a tuple of
      # exception args instead of one readable message.
      raise ValueError(
          'Channel is not a legal BigQuery column name: %s' % channel)
  return channels
343 |
344 |
def _get_fullvisitorid_userid_map_params(
    input_params: Mapping[str, Any]) -> Mapping[str, Any]:
  """Checks, transforms and returns the userid-mapping input_params.

  Args:
    input_params: Mapping from input parameter names to values.
  Returns:
    Mapping from userid-mapping parameters to values.
  Raises:
    ValueError: User formatted message on error.
  """
  # Default behavior is to update the id map.
  params = {
      'update_fullvisitorid_userid_map': input_params.get(
          'update_fullvisitorid_userid_map', True)
  }
  # Session-level and hit-level custom dimension indexes holding the userid
  # mapping. 0 indicates the dimension is not used; otherwise it must be >= 1.
  for index_param in ('userid_ga_custom_dimension_index',
                      'userid_ga_hits_custom_dimension_index'):
    if input_params.get(index_param) is None:
      params[index_param] = 0
    else:
      params[index_param] = parse_int_param(input_params, index_param, 1)
  return params
374 |
375 |
def _get_template_params(input_params: Mapping[str, Any]) -> Dict[str, Any]:
  """Checks, transforms and returns input_params into an internal param mapping.

  The returned mapping is a superset of input_params: it adds the jinja
  environment, resolved conversion-window dates, output table ids, rendered
  SQL fragments and the parsed path transforms used by later queries.

  Args:
    input_params: Mapping from input parameter names to values.
  Returns:
    Mapping of template parameter names to parameter values.
  Raises:
    ValueError: User formatted if input_params contains an error.
  """
  params = {}
  params.update(input_params)
  jinja_env = _get_jinja_env(params)
  params['jinja_env'] = jinja_env
  # Required top-level parameters: fail fast if any is missing.
  params['project_id'] = _get_param_or_die(params, 'project_id')
  params['dataset'] = _get_param_or_die(params, 'dataset')
  params['ga_sessions_table'] = _get_param_or_die(params, 'ga_sessions_table')
  # Check the model
  if ('attribution_model' not in params or params['attribution_model'] not in
      fractribution.Fractribution.ATTRIBUTION_MODELS):
    raise ValueError(
        'Unknown attribution_model. Use one of: ',
        fractribution.Fractribution.ATTRIBUTION_MODELS.keys())
  # Conversion window parameters
  params.update(_get_conversion_window_date_params(params))
  # Output table ids are suffixed with the window end date as %Y%m%d.
  params.update(_get_output_table_ids(
      params['project_id'],
      params['dataset'],
      datetime.datetime.strptime(
          params['conversion_window_end_date'],
          '%Y-%m-%d').date().strftime('%Y%m%d')))
  # Get the conversion extraction SQL. Each *_sql param may name a custom
  # template; otherwise the bundled default template is rendered.
  params['conversion_definition_sql'] = _strip_sql(
      jinja_env.get_template(
          params.get('conversion_definition_sql',
                     'conversion_definition.sql')).render(params))
  params['extract_conversions_sql'] = _strip_sql(
      jinja_env.get_template(params.get(
          'extract_conversions_sql', 'extract_conversions.sql')).render(params))
  # Get the channel definition SQL.
  params['channel_definitions_sql'] = _strip_sql(
      jinja_env.get_template(params.get(
          'channel_definitions_sql', 'channel_definitions.sql')).render(params))
  params.update(_get_path_lookback_params(params))
  params.update(_get_fullvisitorid_userid_map_params(params))
  # Process the hostname restrictions: convert the comma-separated string
  # into a quoted, comma-separated SQL list.
  if params.get('hostnames', None) is not None:
    params['hostnames'] = ', '.join([
        "'%s'" % hostname for hostname in params['hostnames'].split(',')])
  # Check the path_transforms.
  path_transform_param = _get_param_or_die(params, 'path_transform')
  # For backwards compatibility, if path_transform_param is a string, for
  # example, if Fractribution is invoked as a cloud function, convert it
  # to a list.
  if isinstance(path_transform_param, str):
    path_transform_param = [path_transform_param]
  params['path_transforms'] = []
  # A positive path_lookback_steps implies a leading trimLongPath transform.
  if params['path_lookback_steps'] > 0:
    # Implementation note: Line below is equivalent to .append(), however append
    # does not pass the static type checking.
    params['path_transforms'] = [(
        _PATH_TRANSFORMS_MAP['trimLongPath'],
        str(params['path_lookback_steps']))]
  params['path_transforms'] += parse_path_transforms(path_transform_param)
  if not params['path_transforms']:
    raise ValueError('Must specify at least one path_transform.')
  return params
443 |
444 |
def extract_fractribution_input_data(
    client: bigquery.client.Client, params: Mapping[str, Any]) -> None:
  """Extracts the input data for fractribution into BigQuery.

  Args:
    client: BigQuery client.
    params: Mapping of all template parameter names to values.
  """
  sql_template = params['jinja_env'].get_template('extract_data.sql')
  extraction_query = _strip_sql(sql_template.render(params))
  # result() blocks until the job completes. Nothing is returned, as all
  # output is stored in BigQuery tables.
  client.query(extraction_query).result()
458 |
459 |
def run_fractribution(
    client: bigquery.client.Client, params: Mapping[str, Any]) -> None:
  """Runs fractribution on the extract_fractribution_input_data BigQuery tables.

  Args:
    client: BigQuery client.
    params: Mapping of all template parameter names to values.
  """

  # Step 1: Extract the paths from the path_summary_table.
  frac = fractribution.Fractribution(client.query(
      params['jinja_env'].get_template('select_path_summary_query.sql').render(
          path_summary_table=params['path_summary_table'])))
  # Step 2: Run the selected attribution model and normalize channel names.
  frac.run_fractribution(params['attribution_model'])
  frac.normalize_channel_to_attribution_names()
  # Step 3: Create the path_summary_table and upload the results.
  create_path_summary_table_sql = params['jinja_env'].get_template(
      'create_path_summary_results_table.sql').render(params)
  client.query(create_path_summary_table_sql).result()
  frac.upload_path_summary(client, params['path_summary_table'])
  frac.upload_report_table(client,
                           params['conversion_window_start_date'],
                           params['conversion_window_end_date'],
                           params['report_table'])
484 |
485 |
def generate_report(
    client: bigquery.client.Client, params: Mapping[str, Any]) -> None:
  """Generates the final BigQuery Table with channel-level attribution and ROAS.

  Args:
    client: BigQuery client.
    params: Mapping of all template parameter names to values.
  """
  report_sql = params['jinja_env'].get_template(
      'generate_report.sql').render(params)
  # Wait for the job to finish; the report is written directly to BigQuery,
  # so there is no result set to consume here.
  client.query(report_sql).result()
496 |
497 |
def run(input_params: Mapping[str, Any]) -> int:
  """Main entry point to run Fractribution with the given input_params.

  Args:
    input_params: Mapping from input parameter names to values.
  Returns:
    0 on success and non-zero otherwise
  """
  params = _get_template_params(input_params)
  client = bigquery.Client(params['project_id'])
  dataset_id = '{}.{}'.format(params['project_id'], params['dataset'])
  dataset = bigquery.Dataset(dataset_id)
  # Pin the dataset location when a region was supplied.
  if params.get('region'):
    dataset.location = params['region']
  client.create_dataset(dataset, exists_ok=True)
  extract_fractribution_input_data(client, params)
  # Extract the channel definitions into params for use in later queries.
  params['channels'] = _extract_channels(client, params)
  run_fractribution(client, params)
  generate_report(client, params)
  return 0
519 |
520 |
def _get_jinja_env(input_params: Mapping[str, Any]) -> jinja2.Environment:
  """Returns a jinja environment for template instantiation.

  Args:
    input_params: Mapping from input parameter names to values.
      By default templates are extracted from ./templates. Include a value for
      the parameter 'templates_dir' to provide an additional location to search
      for templates.
  Returns:
    Jinja Environment.
  """
  default_templates_dir = os.path.join(os.path.dirname(__file__), 'templates')
  custom_templates_dir = input_params.get('templates_dir', None)
  loaders = []
  # A user-supplied directory is searched first, so its templates override
  # the bundled defaults of the same name.
  if custom_templates_dir:
    loaders.append(
        jinja2.FileSystemLoader(os.path.normpath(custom_templates_dir)))
  loaders.append(jinja2.FileSystemLoader(default_templates_dir))
  return jinja2.Environment(
      loader=jinja2.ChoiceLoader(loaders),
      keep_trailing_newline=True,
      lstrip_blocks=True,
      trim_blocks=True)
544 |
545 |
def main(event, unused_context=None) -> int:
  """Entry point for Cloud Function."""
  # Pub/Sub-triggered Cloud Functions deliver the message payload
  # base64-encoded under event['data'].
  payload = base64.b64decode(event['data']).decode('utf-8')
  return run(json.loads(payload))
550 |
551 |
def standalone_main(_):
  # Run Fractribution from the command line. The positional argv list passed
  # by absl is unused; all configuration comes from the module-level FLAGS.
  input_params = FLAGS.flag_values_dict()
  run(input_params)

if __name__ == '__main__':
  app.run(standalone_main)
558 |
--------------------------------------------------------------------------------
/py/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py
2 | google-cloud-bigquery
3 | google-cloud-logging
4 | functions-framework==1.3.0
5 | google-cloud-pubsub
6 |
--------------------------------------------------------------------------------
/py/start.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2022 Google LLC..
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Entry point for running fractribution in Docker container."""
17 |
18 | import json
19 | import logging
20 | import os
21 | import time
22 | import googleapiclient.discovery
23 | import main
24 | import requests
25 |
26 | import google.cloud.logging
27 |
28 | client = google.cloud.logging.Client()
29 | client.get_default_handler()
30 | client.setup_logging()
31 | logging.info(os.environ["fractribution_param"])
32 | param = json.loads(os.environ["fractribution_param"])
33 |
34 | logging.info("Start Fractribution")
35 | logging.info(param)
36 | try:
37 | main.run(param)
38 | logging.info("Fractribution Done!")
39 |
40 | except Exception as e:
41 | logging.error("An exception occurred")
42 | logging.exception(e)
43 |
44 | logging.info("Shutting down.....")
45 | headers = {"Metadata-Flavor": "Google"}
46 | meta_response = requests.get(
47 | url="http://metadata.google.internal/computeMetadata/v1/instance/name",
48 | headers=headers)
49 | instance_name = meta_response.text
50 | meta_response = requests.get(
51 | url="http://metadata.google.internal/computeMetadata/v1/instance/zone",
52 | headers=headers)
53 | zone = meta_response.text.split("/")[-1]
54 |
55 | meta_response = requests.get(
56 | url="http://metadata.google.internal/computeMetadata/v1/project/project-id",
57 | headers=headers)
58 | project = meta_response.text
59 | compute = googleapiclient.discovery.build(
60 | "compute", "v1", cache_discovery=False)
61 | request = compute.instances().stop(
62 | project=project, zone=zone, instance=instance_name)
63 | response = request.execute()
64 | logging.info(response)
65 |
66 | logging.getLogger().handlers[0].flush()
67 | time.sleep(120)
68 |
--------------------------------------------------------------------------------
/py/templates/channel_counts.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Query the number of marketing events, aggregated by channel, campaign, source and medium.
-- NOTE(review): SessionsByCustomerId is not defined in this file; it is
-- presumably a CTE provided by the including query (verify in extract_data.sql).
-- This fragment is not runnable standalone.
SELECT
  channel,
  campaign,
  source,
  medium,
  COUNT(*) AS number_of_events
FROM SessionsByCustomerId
GROUP BY channel, campaign, source, medium
ORDER BY channel, number_of_events DESC
25 |
--------------------------------------------------------------------------------
/py/templates/channel_definitions.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# SQL mapping from channel definition to channel name.
-- A final catch-all 'Unmatched_Channel' must be included for unmatched channels.
-- Note: Channel names become BigQuery column names, so they must consist of letters, numbers and
-- underscores only. Also, column names must be at most 300 characters long. See
-- https://cloud.google.com/bigquery/docs/schemas#column_names for the full specification.
--
-- Default channel definitions (see the end for campaign-level definitions)
-- Evaluation is order dependent: the first matching WHEN wins, so the more
-- specific paid-search / display / social buckets must stay above the generic
-- Direct / Referral / Email / Other_Advertising fallbacks.
CASE
  WHEN
    LOWER(trafficSource.medium) IN ('cpc', 'ppc')
    AND REGEXP_CONTAINS(LOWER(trafficSource.campaign), r'brand')
    THEN 'Paid_Search_Brand'
  WHEN
    LOWER(trafficSource.medium) IN ('cpc', 'ppc')
    AND REGEXP_CONTAINS(LOWER(trafficSource.campaign), r'generic')
    THEN 'Paid_Search_Generic'
  WHEN
    LOWER(trafficSource.medium) IN ('cpc', 'ppc')
    AND NOT REGEXP_CONTAINS(LOWER(trafficSource.campaign), r'brand|generic')
    THEN 'Paid_Search_Other'
  WHEN LOWER(trafficSource.medium) = 'organic' THEN 'Organic_Search'
  WHEN
    LOWER(trafficSource.medium) IN ('display', 'cpm', 'banner')
    AND REGEXP_CONTAINS(LOWER(trafficSource.campaign), r'prospect')
    THEN 'Display_Prospecting'
  WHEN
    LOWER(trafficSource.medium) IN ('display', 'cpm', 'banner')
    AND REGEXP_CONTAINS(
      LOWER(trafficSource.campaign),
      r'retargeting|re-targeting|remarketing|re-marketing')
    THEN 'Display_Retargeting'
  WHEN
    LOWER(trafficSource.medium) IN ('display', 'cpm', 'banner')
    AND NOT REGEXP_CONTAINS(
      LOWER(trafficSource.campaign),
      r'prospect|retargeting|re-targeting|remarketing|re-marketing')
    THEN 'Display_Other'
  WHEN
    REGEXP_CONTAINS(LOWER(trafficSource.campaign), r'video|youtube')
    OR REGEXP_CONTAINS(LOWER(trafficSource.source), r'video|youtube')
    THEN 'Video'
  WHEN
    LOWER(trafficSource.medium) = 'social'
    AND REGEXP_CONTAINS(LOWER(trafficSource.campaign), r'prospect')
    THEN 'Paid_Social_Prospecting'
  WHEN
    LOWER(trafficSource.medium) = 'social'
    AND REGEXP_CONTAINS(
      LOWER(trafficSource.campaign),
      r'retargeting|re-targeting|remarketing|re-marketing')
    THEN 'Paid_Social_Retargeting'
  WHEN
    LOWER(trafficSource.medium) = 'social'
    AND NOT REGEXP_CONTAINS(
      LOWER(trafficSource.campaign),
      r'prospect|retargeting|re-targeting|remarketing|re-marketing')
    THEN 'Paid_Social_Other'
  WHEN trafficSource.source = '(direct)' THEN 'Direct'
  WHEN LOWER(trafficSource.medium) = 'referral' THEN 'Referral'
  WHEN LOWER(trafficSource.medium) = 'email' THEN 'Email'
  WHEN
    LOWER(trafficSource.medium) IN ('cpc', 'ppc', 'cpv', 'cpa', 'affiliates')
    THEN 'Other_Advertising'
  ELSE 'Unmatched_Channel'
END
80 |
-- Campaign-level channel definitions:
-- Channel name format: <medium>_<source>_<campaign>, with NULLs and any illegal BigQuery Column
-- characters replaced with '_'. If the channel name would be too long for a BigQuery column, it
-- is cropped and appended with a unique id. By default, the channel name is 'Unmatched_Channel',
-- whenever all of <medium>, <source> and <campaign> are NULL.
86 | -- CASE
87 | -- WHEN
88 | -- trafficSource.medium IS NOT NULL
89 | -- OR trafficSource.source IS NOT NULL
90 | -- OR trafficSource.campaign IS NOT NULL
91 | -- THEN
92 | -- REGEXP_REPLACE(
93 | -- IF (LENGTH(
94 | -- ARRAY_TO_STRING([
95 | -- 'medium', trafficSource.medium,
96 | -- 'source', trafficSource.source,
97 | -- 'campaign', trafficSource.campaign], '_', '')) <= 300,
98 | -- ARRAY_TO_STRING([
99 | -- 'medium', trafficSource.medium,
100 | -- 'source', trafficSource.source,
101 | -- 'campaign', trafficSource.campaign], '_', ''),
102 | -- CONCAT(LEFT(ARRAY_TO_STRING([
103 | -- 'medium', trafficSource.medium,
104 | -- 'source', trafficSource.source,
105 | -- 'campaign', trafficSource.campaign], '_', ''), 279),
106 | -- '_',
107 | -- FARM_FINGERPRINT(ARRAY_TO_STRING([
108 | -- 'medium', trafficSource.medium,
109 | -- 'source', trafficSource.source,
110 | -- 'campaign', trafficSource.campaign], '_', ''))
111 | -- )
112 | -- ), '[^a-zA-Z0-9_]','_')
113 | -- ELSE
114 | -- 'Unmatched_Channel'
115 | -- END
116 |
--------------------------------------------------------------------------------
/py/templates/conversion_definition.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | -- SQL conversion logic fragment. See extract_conversions.sql for how the fragment is included.
16 | --
17 | -- Use the field names described in the BigQuery export schema for Google Analytics here:
18 | -- https://support.google.com/analytics/answer/3437719?hl=en
19 | -- However, when referencing a repeated field, use the relevant aliases below:
20 | -- Field: Alias
21 | -- hits.customVariables: hitsCustomVariables
22 | -- hits.customDimensions: hitsCustomDimensions
23 | -- hits.customMetrics: hitsCustomMetrics
24 | -- hits.products: hitsProducts
25 | -- hits.promotions: hitsPromotions
26 | -- hits.experiments: hitsExperiments
27 | -- hits.publisher_infos: hitsPublisherInfos
28 | -- customDimensions: customDimensions
29 | --
-- Example 1:
-- A conversion is any session with positive total transaction revenue.
totals.totalTransactionRevenue > 0
32 | --
33 | -- Example 2: Using hits and hits.customDimensions.
34 | -- hits.eventInfo.eventCategory = 'customer_registration'
35 | -- AND REGEXP_CONTAINS(hits.eventInfo.eventAction, r'complete|success')
36 | -- AND hits.page.hostname = 'signup.your-site.com'
37 | -- AND hitsCustomDimensions.index = 2
38 | -- AND hitsCustomDimensions.value = 'specific_tag'
39 |
--------------------------------------------------------------------------------
/py/templates/create_path_summary_results_table.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
-- Creates (or replaces) the table that fractribution path-summary results are uploaded into.
-- Args:
--   path_summary_table: Fully-qualified id of the destination BigQuery table.
--   channels: List of channel names; one FLOAT64 attribution column is generated per channel.
CREATE OR REPLACE TABLE `{{path_summary_table}}` (
  transformedPath STRING NOT NULL,
  conversions INT64 NOT NULL,
  nonConversions INT64 NOT NULL,
  revenue FLOAT64,
  {% for channel in channels %}
  {{channel}} FLOAT64,
  {% endfor %}
);
24 |
--------------------------------------------------------------------------------
/py/templates/extract_channel_spend_data.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | -- User supplied SQL script to extract total ad spend by channel.
16 | --
17 | -- Required output schema:
18 | -- channel: STRING NOT NULL (Must match those in channel_definitions.sql.)
19 | -- spend: FLOAT64 (Use the same monetary units as conversion revenue, and NULL if unknown.)
20 | --
21 | -- Note that all flags are passed into this template (e.g. conversion_window_start/end_date).
22 | --
23 | -- Sample uniform spend data for bigquery-public-data.google_analytics_sample.ga_sessions_*:
24 | {% raw %}
25 | -- SELECT * FROM UNNEST({{channels}}) AS channel, UNNEST([10000]) AS spend
26 | {% endraw %}
27 | --
-- DEFAULT: If no spend information is available, use the SQL below to assign a NULL value to the
-- spend for each channel.
-- NOTE(review): a NULL spend presumably leaves the channel's ROAS undefined rather than implying
-- zero spend — verify against generate_report.sql.
SELECT * FROM UNNEST({{channels}}) AS channel, UNNEST([NULL]) AS spend
31 |
--------------------------------------------------------------------------------
/py/templates/extract_channels.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Query to extract the set of channels
-- Args:
--   channel_counts_table: BigQuery table of per-channel event counts (see channel_counts.sql).
SELECT DISTINCT channel
FROM `{{channel_counts_table}}`
18 |
--------------------------------------------------------------------------------
/py/templates/extract_conversions.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Extracts customer conversions.
16 | -- Args:
17 | -- ga_sessions_table: Google Analytics BigQuery table.
18 | -- conversion_window_start_date: Ignore conversions before this %Y-%m-%d date string.
19 | -- conversion_window_end_date: Ignore conversions after this %Y-%m-%d date string.
20 | -- conversion_definition_sql: Custom SQL that defines a customer conversion.
21 | -- hostnames: Comma separated list of hostnames to restrict to.
22 | -- fullvisitorid_userid_map_table: BigQuery table with mapping from fullVisitorId to userId.
23 | --
24 | -- By default, this script extracts conversions from the Google Analytics BigQuery table. If your
25 | -- conversions are stored in a separate table, replace this script with SQL that SELECTs the data
26 | -- with the following schema:
27 | -- customerId: STRING NOT NULL
28 | -- For multi-device support, set the customerId to the userId when possible. If your conversion
29 | -- table has fullVisitorId, join it with the fullvisitorid_userid_map_table to lookup
30 | -- corresponding userIds as below. For the final customerId, prepend it with 'u' if it is
31 | -- a userId, and 'f' otherwise.
32 | -- conversionTimestamp: TIMESTAMP NOT NULL
33 | -- revenue: FLOAT64
34 | -- Use NULL if revenue is unknown. NULL values are ignored in the final ROAS calculations.
35 | -- However, the channels leading to this conversion will still get attribution credit.
36 | WITH
37 |   ConversionsByFullVisitorId AS (
38 |     SELECT
39 |       fullVisitorId,
40 |       TIMESTAMP_SECONDS(
41 |         MIN(SAFE_CAST(visitStartTime + (hits.time / 1e3) AS INT64))) AS conversionTimestamp,
42 |       totals.totalTransactionRevenue / 1e6 AS revenue
43 |     FROM
44 |       `{{ga_sessions_table}}` AS Sessions,
45 |       UNNEST(hits) AS hits
46 |       {% if 'customDimensions.' in conversion_definition_sql %}
47 |       -- Using LEFT JOIN because if the UNNEST is empty, a CROSS JOIN will be empty too, and we may
48 |       -- want to inspect a separate UNNEST below.
49 |       LEFT JOIN UNNEST(customDimensions) AS customDimensions
50 |       {% endif %}
51 |       {% if 'hitsCustomDimensions.' in conversion_definition_sql %}
52 |       LEFT JOIN UNNEST(hits.customDimensions) AS hitsCustomDimensions
53 |       {% endif %}
54 |       {% if 'hitsCustomVariables.' in conversion_definition_sql %}
55 |       LEFT JOIN UNNEST(hits.customVariables) AS hitsCustomVariables
56 |       {% endif %}
57 |       {% if 'hitsCustomMetrics.' in conversion_definition_sql %}
58 |       LEFT JOIN UNNEST(hits.customMetrics) AS hitsCustomMetrics
59 |       {% endif %}
60 |       {% if 'hitsProducts.' in conversion_definition_sql %}
61 |       LEFT JOIN UNNEST(hits.products) AS hitsProducts
62 |       {% endif %}
63 |       {% if 'hitsPromotions.' in conversion_definition_sql %}
64 |       LEFT JOIN UNNEST(hits.promotions) AS hitsPromotions
65 |       {% endif %}
66 |       {% if 'hitsExperiments.' in conversion_definition_sql %}
67 |       LEFT JOIN UNNEST(hits.experiments) AS hitsExperiments
68 |       {% endif %}
69 |       {% if 'hitsPublisherInfos.' in conversion_definition_sql %}
70 |       LEFT JOIN UNNEST(hits.publisher_infos) AS hitsPublisherInfos
71 |       {% endif %}
72 |     WHERE
73 |       _TABLE_SUFFIX BETWEEN
74 |         FORMAT_TIMESTAMP(
75 |           '%Y%m%d', TIMESTAMP_SUB(TIMESTAMP('{{conversion_window_start_date}}'), INTERVAL 1 DAY))
76 |         AND FORMAT_TIMESTAMP(
77 |           "%Y%m%d", TIMESTAMP_ADD(TIMESTAMP('{{conversion_window_end_date}}'), INTERVAL 1 DAY))
78 |       AND visitStartTime BETWEEN
79 |         UNIX_SECONDS('{{conversion_window_start_date}} 00:00:00 UTC')
80 |         AND UNIX_SECONDS('{{conversion_window_end_date}} 23:59:59 UTC')
81 |       AND (
82 |         {% filter indent(width=8) %}{{conversion_definition_sql}}{% endfilter %}
83 |       )
84 |     GROUP BY
85 |       fullVisitorId,
86 |       visitStartTime,
87 |       revenue
88 |   ),
89 |   FullVisitorIdUserIdMapTable AS (
90 |     SELECT DISTINCT fullVisitorId, userId FROM `{{fullvisitorid_userid_map_table}}`
91 |   )
92 | SELECT
93 |   CASE
94 |     WHEN FullVisitorIdUserIdMapTable.userId IS NOT NULL
95 |       THEN CONCAT('u', FullVisitorIdUserIdMapTable.userId)
96 |     ELSE CONCAT('f', ConversionsByFullVisitorId.fullVisitorId)
97 |   END AS customerId,
98 |   conversionTimestamp,
99 |   revenue
100 | FROM ConversionsByFullVisitorId
101 | LEFT JOIN FullVisitorIdUserIdMapTable USING (fullVisitorId)
102 | -- Do not include a trailing ; as this query is included in another SQL query.
103 |
104 |
--------------------------------------------------------------------------------
/py/templates/extract_data.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # SQL script for extracting the paths to conversion and non-conversion used in Fractribution.
16 | -- Args:
17 | --   fullvisitorid_userid_map_table: BigQuery table mapping fullVisitorIds to userIds.
18 | --   update_fullvisitorid_userid_map: Whether to insert newly extracted mappings into the map table.
19 | --   extract_conversions_sql: Custom SQL for extracting all conversions.
20 | --   paths_to_conversion_table: Output table of channel paths ending in a conversion.
21 | --   paths_to_non_conversion_table: Output table of channel paths for customers with no conversion.
22 | --   path_summary_table: Output table of conversion/non-conversion/revenue totals per path.
23 | --   channel_counts_table: Output table of per-channel counts (see channel_counts.sql).
24 |
25 | # Create the fullvisitorid_userid_map_table if it does not exist.
26 | CREATE TABLE IF NOT EXISTS `{{fullvisitorid_userid_map_table}}` (
27 |   fullVisitorId STRING NOT NULL,
28 |   userId STRING NOT NULL,
29 |   mapStartTimestamp TIMESTAMP NOT NULL,
30 |   tableSuffixWhenAdded STRING NOT NULL
31 | );
32 | {% if update_fullvisitorid_userid_map %}
33 | INSERT `{{fullvisitorid_userid_map_table}}`
34 |   (fullVisitorId, userId, mapStartTimestamp, tableSuffixWhenAdded)
35 | {% include 'extract_fullvisitorid_userid_map.sql' %};
36 | {% endif %}
37 |
38 | CREATE TEMP TABLE ConversionsByCustomerId AS (  -- referenced by paths_to_conversion.sql and paths_to_non_conversion.sql below.
39 |   {% filter indent(width=2) %}
40 |   {{extract_conversions_sql}}
41 |   {% endfilter %}
42 |   -- Including blank line to force a newline, in case extract_conversions.sql ends with a comment.
43 |
44 | );
45 |
46 | CREATE TEMP TABLE SessionsByCustomerId AS (  -- referenced by paths_to_conversion.sql and paths_to_non_conversion.sql below.
47 |   {% filter indent(width=2) %}
48 |   {% include 'extract_ga_sessions.sql' %}
49 |   {% endfilter %}
50 |   -- Including blank line to force a newline, in case extract_ga_sessions.sql ends with a comment.
51 |
52 | );
53 |
54 | {% include 'path_transforms.sql' %}
55 |
56 | CREATE OR REPLACE TABLE `{{paths_to_conversion_table}}` AS (
57 |   {% filter indent(width=2) %}
58 |   {% include 'paths_to_conversion.sql' %}
59 |   {% endfilter %}
60 | );
61 |
62 | CREATE OR REPLACE TABLE `{{paths_to_non_conversion_table}}` AS (
63 |   {% filter indent(width=2) %}
64 |   {% include 'paths_to_non_conversion.sql' %}
65 |   {% endfilter %}
66 | );
67 |
68 | CREATE OR REPLACE TABLE `{{path_summary_table}}` AS (
69 |   {% filter indent(width=2) %}
70 |   {% include 'path_summary.sql' %}
71 |   {% endfilter %}
72 | );
73 |
74 | CREATE OR REPLACE TABLE `{{channel_counts_table}}` AS (
75 |   {% filter indent(width=2) %}
76 |   {% include 'channel_counts.sql' %}
77 |   {% endfilter %}
78 | );
79 |
--------------------------------------------------------------------------------
/py/templates/extract_fullvisitorid_userid_map.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Extracts new mappings between fullVisitorId and userId from the Google Analytics BigQuery table.
16 | -- Args:
17 | -- ga_sessions_table: Google Analytics BigQuery table.
18 | -- userid_ga_custom_dimension_index: Index of the userId in the Google Analytics custom dimension.
19 | -- userid_ga_hits_custom_dimension_index: Index of the userId in the Google Analytics hits custom
20 | -- dimension.
21 | -- fullvisitorid_userid_map_table: BigQuery table with mappings from fullVisitorIds to userIds.
22 | --
23 | -- If your fullVisitorId to userId mappings are outside Google Analytics, replace this script with
24 | -- one that queries your table and extracts the following:
25 | -- fullVisitorId STRING NOT NULL,
26 | -- userId STRING NOT NULL,
27 | -- mapStartTimestamp TIMESTAMP NOT NULL,
28 | -- tableSuffixWhenAdded STRING
29 | -- For efficiency, from run to run, only query newer mappings in your table, instead of the
30 | -- entire table.
31 | SELECT DISTINCT
32 |   fullVisitorId,
33 |   CASE  -- Prefer the native userId; filter placeholder values so they never become customerIds.
34 |     WHEN userId IS NOT NULL AND LOWER(userId) NOT IN ('', 'undefined', 'n/a') THEN userId
35 |   {% if userid_ga_custom_dimension_index > 0 %}
36 |     WHEN
37 |       customDimension.index = {{userid_ga_custom_dimension_index}}
38 |       AND customDimension.value IS NOT NULL AND LOWER(customDimension.value) NOT IN ('', 'undefined', 'n/a')
39 |     THEN customDimension.value
40 |   {% endif %}
41 |   {% if userid_ga_hits_custom_dimension_index > 0 %}
42 |     WHEN
43 |       hitsCustomDimension.index = {{userid_ga_hits_custom_dimension_index}}
44 |       AND hitsCustomDimension.value IS NOT NULL AND LOWER(hitsCustomDimension.value) NOT IN ('', 'undefined', 'n/a')
45 |     THEN hitsCustomDimension.value
46 |   {% endif %}
47 |     ELSE NULL
48 |   END AS userId,
49 |   TIMESTAMP_SECONDS(visitStartTime) AS mapStartTimestamp,
50 |   _TABLE_SUFFIX AS tableSuffixWhenAdded
51 | FROM `{{ga_sessions_table}}` AS Sessions
52 | {% if userid_ga_custom_dimension_index > 0 %}
53 | LEFT JOIN UNNEST(Sessions.customDimensions) as customDimension
54 | {% endif %}
55 | {% if userid_ga_hits_custom_dimension_index > 0 %},
56 |   UNNEST(Sessions.hits) as hits -- hits cannot be empty, since Sessions begin with a hit.
57 | LEFT JOIN UNNEST(hits.customDimensions) as hitsCustomDimension
58 | {% endif %}
59 | WHERE
60 |   _TABLE_SUFFIX BETWEEN
61 |     -- From one day after the maximum table suffix previously recorded.
62 |     (SELECT
63 |        FORMAT_DATE(
64 |          "%Y%m%d",
65 |          DATE_ADD(PARSE_DATE("%Y%m%d", IFNULL(MAX(tableSuffixWhenAdded), "19700101")),
66 |            INTERVAL 1 DAY)) -- 1 day after the latest tableSuffixWhenAdded.
67 |      FROM `{{fullvisitorid_userid_map_table}}`)
68 |     -- To yesterday.
69 |     AND FORMAT_DATE('%Y%m%d', CURRENT_DATE('UTC') - 1)
70 |   AND fullVisitorId IS NOT NULL
71 |   AND (
72 |     Sessions.userId IS NOT NULL
73 |     AND LOWER(Sessions.userId) NOT IN ('', 'undefined', 'n/a')
74 |     {% if userid_ga_custom_dimension_index > 0 %}
75 |     OR (
76 |       customDimension.index = {{userid_ga_custom_dimension_index}}
77 |       AND customDimension.value IS NOT NULL
78 |       AND LOWER(customDimension.value) NOT IN ('', 'undefined', 'n/a')
79 |     )
80 |     {% endif %}
81 |     {% if userid_ga_hits_custom_dimension_index > 0 %}
82 |     OR (
83 |       hitsCustomDimension.index = {{userid_ga_hits_custom_dimension_index}}
84 |       AND hitsCustomDimension.value IS NOT NULL
85 |       AND LOWER(hitsCustomDimension.value) NOT IN ('', 'undefined', 'n/a')
86 |     )
87 |     {% endif %}
88 |   )
89 |
90 |
--------------------------------------------------------------------------------
/py/templates/extract_ga_sessions.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Extracts information from the BigQuery Google Analytics table for constructing paths of channels.
16 | --
17 | -- This script goes through the Google Analytics sessions and pulls out the customerId and
18 | -- traffic-source marketing channel, as defined in channel_definitions.sql. The window for
19 | -- extracting sessions includes the conversion window, plus the preceding {{path_lookback_days}}.
20 | -- Also, the window is extended by one day on either side to account for the sessions being
21 | -- partitioned in local time, while the session.visitStartTime is in UTC (and fractribution
22 | -- reports in UTC).
23 | --
24 | -- Args:
25 | -- ga_sessions_table: Google Analytics BigQuery table.
26 | -- fullvisitorid_userid_map_table: BigQuery table of distinct (fullVisitorId, userId) mappings
27 | -- conversion_window_start_date: Start date of the conversion window in %Y-%m-%d format.
28 | -- conversion_window_end_date: End date of the conversion window in %Y-%m-%d format.
29 | -- channel_definitions_sql: SQL mapping from channel definitions to channel names
30 | -- path_lookback_days: Number of days to extract sessions before the conversion_window_start_date.
31 | -- hostnames: Comma separated list of hostnames to restrict to.
32 | --
33 | -- What about channel touchpoints not recorded in Google Analytics:
34 | -- If you have a third party source of channel touchpoints, add a new SELECT statement below
35 |    to extract the additional channel touchpoints and then UNION ALL the results with the channels
36 | -- extracted in this script.
37 | WITH
38 |   FilteredSessions AS (
39 |     SELECT
40 |       fullVisitorId,
41 |       TIMESTAMP_SECONDS(visitStartTime) as visitStartTimestamp,  -- visitStartTime is POSIX seconds, rendered as a UTC timestamp.
42 |       {% filter indent(width=6) %}
43 |       {{channel_definitions_sql}} AS channel,
44 |       {% endfilter %}
45 |       trafficSource.referralPath,
46 |       trafficSource.campaign,
47 |       trafficSource.source,
48 |       trafficSource.medium
49 |     FROM
50 |       `{{ga_sessions_table}}` AS Sessions
51 |       {% if hostnames %},
52 |       UNNEST(hits) AS hits
53 |       {% endif %}
54 |     WHERE
55 |       _TABLE_SUFFIX
56 |         BETWEEN FORMAT_TIMESTAMP('%Y%m%d', TIMESTAMP_SUB(
57 |           TIMESTAMP('{{conversion_window_start_date}}'), INTERVAL {{path_lookback_days + 1}} DAY))
58 |         AND FORMAT_TIMESTAMP('%Y%m%d', TIMESTAMP_ADD(
59 |           TIMESTAMP('{{conversion_window_end_date}}'), INTERVAL 1 DAY))
60 |       AND visitStartTime BETWEEN
61 |         UNIX_SECONDS(TIMESTAMP_SUB(
62 |           TIMESTAMP('{{conversion_window_start_date}}'), INTERVAL {{path_lookback_days}} DAY))
63 |         AND UNIX_SECONDS('{{conversion_window_end_date}} 23:59:59 UTC')
64 |       {% if hostnames %}
65 |       AND hits.hitNumber = 1  -- hostname restriction checks only the session's first hit.
66 |       AND hits.page.hostname IN ({{hostnames}})
67 |       {% endif %}
68 |   ),
69 |   FullVisitorIdUserIdMapTable AS (
70 |     SELECT DISTINCT fullVisitorId, userId FROM `{{fullvisitorid_userid_map_table}}`
71 |   )
72 | SELECT
73 |   CASE  -- Prefix marks the id source: 'u' for userId-based, 'f' for fullVisitorId-based customerIds.
74 |     WHEN FullVisitorIdUserIdMapTable.userId IS NOT NULL
75 |       THEN CONCAT('u', FullVisitorIdUserIdMapTable.userId)
76 |     ELSE CONCAT('f', FilteredSessions.fullVisitorId)
77 |   END AS customerId,
78 |   FilteredSessions.* EXCEPT (fullVisitorId)
79 | FROM FilteredSessions
80 | LEFT JOIN FullVisitorIdUserIdMapTable USING (fullVisitorId)
81 | -- Do not include a trailing ; as this query is included in another SQL query.
82 |
--------------------------------------------------------------------------------
/py/templates/generate_report.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Construct the fractribution report table, with channel-level conversions, revenue, spend and ROAS.
16 | --
17 | -- Aggregates fractional conversion and revenue data by channel, joins it with channel-level spend
18 | -- if supplied, and computes channel-level return on ad spend (ROAS).
19 | --
20 | -- Note that revenue data is not required. If revenue is NULL for a customer conversion, the
21 | -- conversion is ignored in the ROAS calculation. However, the channels on the path to conversion
22 | -- will still receive fractional attribution for the conversion.
23 | --
24 | -- Although the column name is revenue, alternative values can be substituted, like predicted
25 | -- customer-lifetime value.
26 |
27 | CREATE OR REPLACE TEMP TABLE ChannelSpendTable AS (
28 |   {% include 'extract_channel_spend_data.sql' %}
29 | );
30 |
31 | CREATE OR REPLACE TABLE `{{report_table}}` AS (
32 |   SELECT
33 |     ConversionRevenueTable.conversionWindowStartDate,
34 |     ConversionRevenueTable.conversionWindowEndDate,
35 |     ConversionRevenueTable.channel,
36 |     ConversionRevenueTable.conversions,
37 |     ConversionRevenueTable.revenue,
38 |     ChannelSpendTable.spend,  -- NULL when no spend data exists for the channel.
39 |     SAFE_DIVIDE(ConversionRevenueTable.revenue, ChannelSpendTable.spend) AS roas  -- SAFE_DIVIDE returns NULL instead of erroring on zero/NULL spend.
40 |   FROM `{{report_table}}` AS ConversionRevenueTable  -- NOTE(review): self-reference is intentional; the SELECT reads the existing table contents before the table is replaced.
41 |   LEFT JOIN ChannelSpendTable USING (channel)
42 |   ORDER BY conversions DESC
43 | );
44 |
--------------------------------------------------------------------------------
/py/templates/path_summary.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Gets the total number of conversions, non-conversions and revenue by path.
16 | -- Args:
17 | -- paths_to_conversion_table: BigQuery table described in paths_to_conversion.sql
18 | -- paths_to_non_conversion_table: BigQuery table described in paths_to_non_conversion.sql
19 | WITH PathsToConversion AS (
20 |   SELECT transformedPath, COUNT(*) AS conversions, SUM(revenue) AS revenue  -- SUM skips NULL revenue values.
21 |   FROM `{{paths_to_conversion_table}}`
22 |   GROUP BY transformedPath
23 | ), PathsToNonConversion AS (
24 |   SELECT transformedPath, COUNT(*) AS nonConversions
25 |   FROM `{{paths_to_non_conversion_table}}` GROUP BY transformedPath
26 | )
27 | SELECT
28 |   IFNULL(PathsToConversion.transformedPath,
29 |     PathsToNonConversion.transformedPath) AS transformedPath,
30 |   IFNULL(PathsToConversion.conversions, 0) AS conversions,
31 |   IFNULL(PathsToNonConversion.nonConversions, 0) AS nonConversions,
32 |   PathsToConversion.revenue  -- NULL when the path never converted or had no recorded revenue.
33 | FROM PathsToConversion
34 | FULL JOIN PathsToNonConversion
35 |   USING(transformedPath)
36 | -- Do not include a trailing ; as this query is included in another SQL query.
37 |
--------------------------------------------------------------------------------
/py/templates/path_transforms.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Functions for applying transformations to path arrays.
16 | -- unique: Identity transform.
17 | -- E.g. [D, A, B, B, C, D, C, C] --> [D, A, B, B, C, D, C, C].
18 | -- exposure: Collapse sequential repeats.
19 | -- E.g. [D, A, B, B, C, D, C, C] --> [D, A, B, C, D, C].
20 | -- first: Removes repeated events.
21 | -- E.g. [D, A, B, B, C, D, C, C] --> [D, A, B, C].
22 | -- frequency: Removes repeat events but tracks them with a count.
23 | --   E.g. [D, A, B, B, C, D, C, C] --> [D(2), A(1), B(2), C(3)].
24 |
25 | -- Returns the last path_lookback_steps channels in the path if path_lookback_steps > 0,
26 | -- or the full path otherwise.
27 | CREATE TEMP FUNCTION TrimLongPath(path ARRAY<STRING>, path_lookback_steps INT64)
28 | RETURNS ARRAY<STRING>
29 | LANGUAGE js AS """
30 |   if (path_lookback_steps > 0) {
31 |     return path.slice(Math.max(0, path.length - path_lookback_steps));
32 |   }
33 |   return path;
34 | """;
35 |
36 | -- Returns the path with all copies of targetElem removed, unless the path consists only of
37 | -- targetElems, in which case the original path is returned.
38 | CREATE TEMP FUNCTION RemoveIfNotAll(path ARRAY<STRING>, targetElem STRING)
39 | RETURNS ARRAY<STRING>
40 | LANGUAGE js AS """
41 |   var transformedPath = [];
42 |   for (var i = 0; i < path.length; i++) {
43 |     if (path[i] !== targetElem) {
44 |       transformedPath.push(path[i]);
45 |     }
46 |   }
47 |   if (!transformedPath.length) {
48 |     return path;
49 |   }
50 |   return transformedPath;
51 | """;
52 |
53 | -- Returns the path with all copies of targetElem removed from the tail, unless the path consists
54 | -- only of targetElems, in which case the original path is returned.
55 | CREATE TEMP FUNCTION RemoveIfLastAndNotAll(path ARRAY<STRING>, targetElem STRING)
56 | RETURNS ARRAY<STRING>
57 | LANGUAGE js AS """
58 |   var tailIndex = path.length;
59 |   for (var i = path.length - 1; i >= 0; i = i - 1) {
60 |     if (path[i] != targetElem) {
61 |       break;
62 |     }
63 |     tailIndex = i;
64 |   }
65 |   if (tailIndex > 0) {
66 |     return path.slice(0, tailIndex);
67 |   }
68 |   return path;
69 | """;
70 |
71 | -- Returns the unique/identity transform of the given path array.
72 | -- E.g. [D, A, B, B, C, D, C, C] --> [D, A, B, B, C, D, C, C].
73 | CREATE TEMP FUNCTION Unique(path ARRAY<STRING>)
74 | RETURNS ARRAY<STRING>
75 | LANGUAGE js AS """
76 |   return path;
77 | """;
78 |
79 | -- Returns the exposure transform of the given path array.
80 | -- Sequential duplicates are collapsed.
81 | -- E.g. [D, A, B, B, C, D, C, C] --> [D, A, B, C, D, C].
82 | CREATE TEMP FUNCTION Exposure(path ARRAY<STRING>)
83 | RETURNS ARRAY<STRING>
84 | LANGUAGE js AS """
85 |   var transformedPath = [];
86 |   for (var i = 0; i < path.length; i++) {
87 |     if (i == 0 || path[i] != path[i-1]) {
88 |       transformedPath.push(path[i]);
89 |     }
90 |   }
91 |   return transformedPath;
92 | """;
93 |
94 | -- Returns the first transform of the given path array.
95 | -- Repeated channels are removed.
96 | -- E.g. [D, A, B, B, C, D, C, C] --> [D, A, B, C].
97 | CREATE TEMP FUNCTION First(path ARRAY<STRING>)
98 | RETURNS ARRAY<STRING>
99 | LANGUAGE js AS """
100 |   var transformedPath = [];
101 |   var channelSet = new Set();
102 |   for (const channel of path) {
103 |     if (!channelSet.has(channel)) {
104 |       transformedPath.push(channel);
105 |       channelSet.add(channel);
106 |     }
107 |   }
108 |   return transformedPath;
109 | """;
110 |
111 | -- Returns the frequency transform of the given path array.
112 | -- Repeat events are removed, but tracked with a count.
113 | -- E.g. [D, A, B, B, C, D, C, C] --> [D(2), A(1), B(2), C(3)].
114 | CREATE TEMP FUNCTION Frequency(path ARRAY<STRING>)
115 | RETURNS ARRAY<STRING>
116 | LANGUAGE js AS """
117 |   var channelToCount = {};
118 |   for (const channel of path) {
119 |     if (!(channel in channelToCount)) {
120 |       channelToCount[channel] = 1;
121 |     } else {
122 |       channelToCount[channel] += 1;
123 |     }
124 |   }
125 |   var transformedPath = [];
126 |   for (const channel of path) {
127 |     var count = channelToCount[channel];
128 |     if (count > 0) {
129 |       transformedPath.push(channel + '(' + count.toString() + ')');
130 |       // Reset count to 0, since the output has exactly one copy of each event.
131 |       channelToCount[channel] = 0;
132 |     }
133 |   }
134 |   return transformedPath;
135 | """;
136 |
--------------------------------------------------------------------------------
/py/templates/paths_to_conversion.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Extracts marketing channel paths that end in conversion for the customer.
16 | -- Args:
17 | -- path_lookback_days: Restrict to marketing channels within this many days of the conversion.
18 | -- path_lookback_steps: Limit the number of marketing channels before the conversion.
19 | -- path_transform: Function name for transforming the path
20 | -- (e.g. unique, exposure, first, frequency).
21 | SELECT
22 |   ConversionsByCustomerId.customerId,
23 |   conversionTimestamp,
24 |   revenue,
25 |   ARRAY_TO_STRING(TrimLongPath(
26 |     ARRAY_AGG(channel ORDER BY visitStartTimestamp), {{path_lookback_steps}}),
27 |     ' > ') AS path,
28 |   ARRAY_TO_STRING(
29 |     {% for path_transform_name, _ in path_transforms|reverse %}
30 |     {{path_transform_name}}(
31 |     {% endfor %}
32 |     ARRAY_AGG(channel ORDER BY visitStartTimestamp)
33 |     {% for _, arg_str in path_transforms %}
34 |     {% if arg_str %}, {{arg_str}}{% endif %})
35 |     {% endfor %}
36 |     , ' > ') AS transformedPath,  -- NOTE(review): trailing comma relies on BigQuery's trailing-comma support in SELECT lists.
37 | FROM ConversionsByCustomerId
38 | LEFT JOIN SessionsByCustomerId  -- NOTE(review): if a conversion has no sessions in the lookback window, channel is NULL and ARRAY_AGG errors on NULL elements -- confirm upstream guarantees a matching session.
39 |   ON
40 |     ConversionsByCustomerId.customerId = SessionsByCustomerId.customerId
41 |     AND TIMESTAMP_DIFF(conversionTimestamp, visitStartTimestamp, DAY)
42 |       BETWEEN 0 AND {{path_lookback_days}}
43 | GROUP BY
44 |   ConversionsByCustomerId.customerId,
45 |   conversionTimestamp,
46 |   revenue
47 | -- Do not include a trailing ; as this query is included in another SQL query.
48 |
--------------------------------------------------------------------------------
/py/templates/paths_to_non_conversion.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Extracts marketing channel paths for customers that have not converted.
16 | -- Args:
17 | -- path_lookback_days: Restrict to marketing channels within this many days of the conversion.
18 | -- path_lookback_steps: Limit the number of marketing channels before the conversion.
19 | -- path_transform: Function name for transforming the path
20 | -- (e.g. unique, exposure, first, frequency).
21 | WITH Conversions AS (
22 |   SELECT DISTINCT customerId
23 |   FROM ConversionsByCustomerId
24 | ),
25 | NonConversions AS (
26 |   SELECT
27 |     SessionsByCustomerId.customerId,
28 |     MAX(visitStartTimestamp) AS nonConversionTimestamp  -- latest session acts as the reference time for the lookback window.
29 |   FROM SessionsByCustomerId
30 |   LEFT JOIN Conversions
31 |     USING (customerId)
32 |   WHERE Conversions.customerId IS NULL  -- anti-join: keep only customers with no conversion.
33 |   GROUP BY SessionsByCustomerId.customerId
34 | )
35 | SELECT
36 |   NonConversions.customerId,
37 |   ARRAY_TO_STRING(TrimLongPath(
38 |     ARRAY_AGG(channel ORDER BY visitStartTimestamp), {{path_lookback_steps}}), ' > ') AS path,
39 |   ARRAY_TO_STRING(
40 |     {% for path_transform_name, _ in path_transforms|reverse %}
41 |     {{path_transform_name}}(
42 |     {% endfor %}
43 |     ARRAY_AGG(channel ORDER BY visitStartTimestamp)
44 |     {% for _, arg_str in path_transforms %}
45 |     {% if arg_str %}, {{arg_str}}{% endif %})
46 |     {% endfor %}
47 |     , ' > ') AS transformedPath,
48 | FROM NonConversions
49 | LEFT JOIN SessionsByCustomerId
50 |   ON
51 |     NonConversions.customerId = SessionsByCustomerId.customerId
52 |     AND TIMESTAMP_DIFF(nonConversionTimestamp, visitStartTimestamp, DAY)
53 |       BETWEEN 0 AND {{path_lookback_days}}
54 | GROUP BY NonConversions.customerId
55 | -- Do not include a trailing ; as this query is included in another SQL query.
56 |
--------------------------------------------------------------------------------
/py/templates/select_path_summary_query.sql:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC..
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | SELECT transformedPath, conversions, nonConversions, revenue FROM `{{path_summary_table}}`  -- revenue is NULL for paths with no recorded revenue (see path_summary.sql).
16 |
--------------------------------------------------------------------------------