├── .gitignore ├── LICENSE ├── README.md ├── contributing.md └── pills ├── CM ├── [DATA_PILL]_[CM]_Campaign_Overlap_(ADH)_v1.ipynb ├── [DATA_PILL]_[CM]_Frequency_Analysis_(ADH).ipynb └── [DATA_PILL]_[CM]_Offline_Conversions_uploader.ipynb ├── GA ├── [DATA_PILL]_[GA360]_Conversion_Blockers.ipynb ├── [DATA_PILL]_[GA360]_Feature_Importance_(v2).ipynb ├── [DATA_PILL]_[GA360]_Predictive_Lifetime_Value.ipynb ├── [DATA_PILL]_[GA]_Customer_Market_Intelligence_(CMI).ipynb ├── [DATA_PILL]_[GA]_Offline_conversion_upload_(from_Google_Cloud_Storage).ipynb └── [DATA_PILL]_[GA]_Offline_conversion_upload_(from_Google_Drive).ipynb ├── Google Ads ├── [DATA_PILL]_[Google_Ads]_Customer_Market_Intelligence_(CMI).ipynb └── [DATA_PILL]_[Google_Ads]_Frequency_and_Audience_Analysis_(ADH).ipynb └── Third Party └── [DATA_PILL]_[Appsflyer]_Install_Report_Analysis.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Data Pills! 2 | 3 | Each _Data Pill_ consists of a _codelab_-style Jupyter notebook built for Google Colab (hosted on Google Drive), which performs advanced analysis on top of GA360, GA4, Google Ads, GMP and Third Party data and provides **actionable** next steps. 
4 | 5 | All Data Pills were built with simplicity in mind: it should take only a couple of minutes to set the environment variables and run an analysis against your media and analytics data. 6 | 7 | The complexity of each pill varies, ranging from basic Python and SQL knowledge to more advanced Machine Learning proficiency. 8 | 9 | **Disclaimer:** This is not an officially supported Google product. 10 | 11 | # Available Pills: 12 | 13 | | Data Pill | Objective | Data Source | Technologies used | 14 | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------ | ------------------------------------------------------------ | 15 | | [CM] Frequency Analysis | Calculate the ideal frequency for campaigns run through Campaign Manager or DV360 | Data Transfer or ADH | - Python (pandas)
- SQL | 16 | | [CM] Campaign Overlap | Understand the impact of each step of the funnel on the client's conversion rate | Data Transfer or ADH | - Python (pandas)
- SQL | 17 | | [CM] Offline Conversions Uploader | Upload gclid- or device_id-based floodlight conversions to Campaign Manager | Google Sheets | - Python | 18 | | [Google Ads] Frequency and Audience Analysis | Calculate the ideal frequency and top-performing audiences for display and video ads in Google Ads | ADH | - Python (pandas)
- SQL | 19 | | [Google Ads] Customer Market Intelligence (CMI) | Find out which Google Audiences are most aligned with your Customer Match audiences | Google Ads Audience Insights report | - Python (pandas) | 20 | | [GA360] Conversion Blockers | Automatically identify UX issues on the website that are blocking conversions (e.g. lack of browser or language support, poorly performing landing pages, …) | GA360 export to BigQuery | - Python (pandas)
- SQL | 21 | | [GA360] Feature Importance | Understand what behaviors (features) are most correlated with purchase probability | GA360 export to BigQuery | - Python (pandas)
- Sklearn (ML)
- SQL | 22 | | [GA360] Predictive Lifetime Value | Calculate the predicted lifetime value (the amount of money the client will spend in the future) | GA360 export to BigQuery | - Python (pandas)
- SQL | 23 | | [GA] Measurement Protocol Sender | Import offline conversions to Google Analytics Ecommerce | GA (free or 360) | - Python
- ClientId collection
- Offline Sales data | 24 | | [GA] Customer Market Intelligence (CMI) | Find out which Google Audiences are most aligned with your products or content | GA (free or 360) | - Python (pandas)
- JavaScript | 25 | | [Third Party] Appsflyer Install Report Analysis | Analyse CTIT behavior of multiple media sources using the export file from Appsflyer's Install Report | Appsflyer's Install Report CSV export | - Python (numpy, pandas, seaborn, matplotlib)
| 26 | 27 | # How to use a Data Pill 28 | 29 | #### 1. Download the `.ipynb` file to your local machine 30 | 31 | Option #1 - From your browser: 32 | 33 | 1. Click on `Code` and then `Download ZIP`. 34 | 2. After the download finishes, unzip the file. 35 | 36 | Option #2 - git clone: 37 | 38 | 1. In your terminal, run `git clone https://github.com/google/data-pills.git` 39 | 40 | #### 2. Upload your `.ipynb` to a Google Drive folder 41 | 42 | Upload it just as you would any other file. 43 | 44 | #### 3. Open the `.ipynb` in Google Colaboratory 45 | 46 | 1. In your Drive folder, right-click on the `.ipynb` file and select Open with >> Google Colaboratory 47 | > If that option is not available, go to drive.google.com, select New >> More >> Google Colaboratory, close the new tab and try again 48 | 49 | #### 4. Follow the notebook instructions 50 | 51 | You are all set; just follow the notebook instructions to run the use case. 52 | 53 | # Feedback? 54 | 55 | Email: data-pills-faq@google.com 56 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code Reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/). 
29 | -------------------------------------------------------------------------------- /pills/CM/[DATA_PILL]_[CM]_Offline_Conversions_uploader.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA PILL] [CM] - Offline Conversions uploader.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "lGpD88nBKjFV" 20 | }, 21 | "source": [ 22 | "### **MAKE A COPY OF THIS DOCUMENT BEFORE YOU START WORKING ON IT**\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "id": "_elHVFYBbHcw" 29 | }, 30 | "source": [ 31 | "Copyright 2021 Google LLC\n", 32 | "\n", 33 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 34 | "you may not use this file except in compliance with the License.\n", 35 | "You may obtain a copy of the License at\n", 36 | "\n", 37 | " https://www.apache.org/licenses/LICENSE-2.0\n", 38 | "\n", 39 | "Unless required by applicable law or agreed to in writing, software\n", 40 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 41 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 42 | "See the License for the specific language governing permissions and\n", 43 | "limitations under the License." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "id": "rTEVcLgTCAQs" 50 | }, 51 | "source": [ 52 | "# Upload offline conversions to Campaign Manager\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "> **What is it?**\n", 57 | "\n", 58 | "\n", 59 | "CM Offline Conversions uploader is sample code that can easily be run by advertisers. With the CM Offline Conversions uploader, you can import conversions and build audiences out of those conversions. This is a great solution if you are looking for:\n", 60 | "\n", 61 | "* Uploading conversions that have been influenced by one of the advertiser's campaigns\n", 62 | "\n", 63 | "* Optimizing a campaign based on offline conversions\n", 64 | "\n", 65 | "* Building audiences out of the conversions that you upload.\n", 66 | "\n", 67 | "\n", 68 | "\n", 69 | "> **What do I need to use this Data Pill?**\n", 70 | "\n", 71 | "1. Make a copy of this Colab (it is ready to go!)\n", 72 | "2. Create a spreadsheet in the Google Drive of the person who will run the cells of this Data Pill (Colab). The spreadsheet must list the gclids or Device_IDs in the first column. There should be 1 ID per row. \n", 73 | "3. Create a Service Account and give it access to the Campaign Manager advertiser\n", 74 | "\n", 75 | "For more detail, visit [this document](https://docs.google.com/document/d/1P9ApL9_lMna53LYB6LzgHYkq_ecJaI5imT2FDG83fmg/edit?usp=sharing)\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "> **Is there anything else I need to know?**\n", 81 | "\n", 82 | "\n", 83 | "1. 
To upload offline conversions based on Device_IDs (IDFA, ADID), the devices must have been exposed to a campaign from the Campaign Manager advertiser that is uploading the conversions.\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "> **Documentation**\n", 89 | "\n", 90 | "[Conversions batchUpdate via API](https://developers.google.com/doubleclick-advertisers/v3.3/conversions/batchupdate) \n", 91 | "\n", 92 | "[FAQ Campaign Manager API](https://developers.google.com/doubleclick-advertisers/guides/conversions_faq)\n", 93 | "\n", 94 | "\n", 95 | "\n", 96 | "> **Disclaimers**\n", 97 | "\n", 98 | "Copyright 2019 Google LLC. This solution, including any related sample code or data, is made available on an “as is,” “as available,” and “with all faults” basis, solely for illustrative purposes, and without warranty or representation of any kind. This solution is experimental, unsupported and provided solely for your convenience. Your use of it is subject to your agreements with Google, as applicable, and may constitute a beta feature as defined under those agreements. To the extent that you make any data available to Google in connection with your use of the solution, you represent and warrant that you have all necessary and appropriate rights, consents and permissions to permit Google to use and process that data. By using any portion of this solution, you acknowledge, assume and accept all risks, known and unknown, associated with its usage, including with respect to your deployment of any portion of this solution in your systems, or usage in connection with your business, if at all.\n" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "paTZ7QtKp4kl" 105 | }, 106 | "source": [ 107 | "\n", 108 | "\n", 109 | "---\n", 110 | "\n", 111 | "\n", 112 | "\n", 113 | "---\n", 114 | "\n", 115 | "\n", 116 | "\n", 117 | "# **Upload necessary information for offline conversions**\n", 118 | "\n", 119 | "**key_type** needs to have one of these two values: \n", 120 | "- gclid\n", 121 | "- mobileDeviceId\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "VtfUTwXgdRnR" 128 | }, 129 | "source": [ 130 | "# Model parameters\n", 131 | "floodlightActivityId = 123123#@param {type:\"integer\"}\n", 132 | "floodlightConfigurationId = 3123123 #@param {type:\"integer\"}\n", 133 | "credentials_dict = \"\" #@param {type:\"string\"}\n", 134 | "profileId_CM = \"1231231\" #@param {type:\"string\"}\n", 135 | "sheet_name = \"devices_ids\" #@param {type:\"string\"}\n", 136 | "key_type = \"mobileDeviceId\" #@param {type: \"string\"}\n" 137 | ], 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "wodc-R2BrIGD" 145 | }, 146 | "source": [ 147 | "" 148 | ], 149 | "execution_count": null, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "pdmPOg9UaXf4" 156 | }, 157 | "source": [ 158 | "print(credentials_dict)" 159 | ], 160 | "execution_count": null, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "id": "Yv6XJpP22s3B" 167 | }, 168 | "source": [ 169 | "!pip install --upgrade -q gspread" 170 | ], 171 | "execution_count": null, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "ukjdOsHN2s15" 178 | }, 179 | "source": [ 180 | "from google.colab import auth\n", 181 | "auth.authenticate_user()\n", 182 | "\n", 183 | "import gspread\n", 184 | "from 
oauth2client.client import GoogleCredentials\n", 185 | "\n", 186 | "gc = gspread.authorize(GoogleCredentials.get_application_default())\n", 187 | "\n" 188 | ], 189 | "execution_count": null, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "llQXm8hS2nh0" 196 | }, 197 | "source": [ 198 | "import json\n", 199 | "import httplib2\n", 200 | "import time\n", 201 | "from oauth2client.service_account import ServiceAccountCredentials\n", 202 | "from googleapiclient import discovery\n", 203 | "\n", 204 | "\n", 205 | "def chunks(l, n):\n", 206 | " for i in range(0, len(l), n):\n", 207 | " yield l[i:i+n]\n", 208 | "\n", 209 | "\n", 210 | "def get_conversions():\n", 211 | " conversions = []\n", 212 | " worksheet = gc.open(sheet_name).sheet1\n", 213 | "\n", 214 | " # get_all_values gives a list of rows.\n", 215 | " results = worksheet.get_all_values()\n", 216 | "\n", 217 | " customVars = []\n", 218 | "\n", 219 | " for row in results:\n", 220 | " # build a fresh dict per row so each appended conversion is independent\n", 221 | " conversion = {\n", 222 | " 'floodlightActivityId': floodlightActivityId,\n", 223 | " 'floodlightConfigurationId': floodlightConfigurationId,\n", 224 | " }\n", 225 | " current_time_in_micros = int(time.time() * 1000000)\n", 226 | " conversion[key_type] = row[0]\n", 227 | " conversion['ordinal'] = current_time_in_micros\n", 228 | " conversion['timestampMicros'] = current_time_in_micros\n", 229 | " # customVars.append(\n", 230 | " # {\n", 231 | " # \"kind\": \"dfareporting#customFloodlightVariable\",\n", 232 | " # \"type\": 'u1',\n", 233 | " # \"value\": row[1]\n", 234 | " # }\n", 235 | " # )\n", 236 | " \n", 237 | "\n", 238 | " # conversion['customVariables'] = customVars\n", 239 | " print(conversion)\n", 240 | " conversions.append(conversion)\n", 241 | " return conversions\n", 242 | "\n", 243 | "\n", 244 | "def send_conversions(conversion, service):\n", 245 | " request_body = {\n", 246 | " 'conversions': conversion,\n", 247 | " }\n", 248 | " request = service.conversions().batchinsert(profileId=profileId_CM,\n", 249 | " body=request_body)\n", 250 | " \n", 251 | " # profileId_CM identifies the CM user profile used for this API call\n", 252 | " \n", 253 | " response = request.execute()\n", 254 | "\n", 255 | " print(response)\n", 256 | "\n", 257 | "\n", 258 | "\n", 259 | "\n", 260 | "def get_service_auth():\n", 261 | " OAUTH_SCOPES = ['https://www.googleapis.com/auth/dfareporting', 'https://www.googleapis.com/auth/ddmconversions', 'https://www.googleapis.com/auth/dfatrafficking']\n", 262 | "\n", 263 | "\n", 264 | "\n", 265 | " json_data = json.loads(credentials_dict, strict=False)\n", 266 | "\n", 267 | "\n", 268 | " \"\"\"Authorizes an httplib2.Http instance using service account credentials.\"\"\"\n", 269 | " # Load the service account credentials from the specified JSON keyfile.\n", 270 | " credentials = ServiceAccountCredentials.from_json_keyfile_dict(json_data, scopes=OAUTH_SCOPES)\n", 271 | " \n", 272 | " # Use the credentials to authorize an httplib2.Http instance.\n", 273 | " http = credentials.authorize(httplib2.Http())\n", 274 | "\n", 275 | " # Construct a service object via the discovery service.\n", 276 | " service = discovery.build('dfareporting', 'v3.3', http=http)\n", 277 | " \n", 278 | " return service\n", 279 | "\n", 280 | "\n", 281 | "def main():\n", 282 | " data = get_conversions()\n", 283 | " print(\"________________________________________\") \n", 284 | " print(\"Sending chunks\")\n", 285 | " service = get_service_auth()\n", 286 | " conversions = chunks(data, 100)\n", 287 | " for conversion in conversions:\n", 288 | 
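" # each chunk of up to 100 conversions is sent as a single batchinsert request\n",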
" print(conversion)\n", 289 | " send_conversions(conversion, service)\n", 290 | " return \"Done\"\n", 291 | "\n", 292 | "main()\n", 293 | "\n" 294 | ], 295 | "execution_count": null, 296 | "outputs": [] 297 | } 298 | ] 299 | } -------------------------------------------------------------------------------- /pills/GA/[DATA_PILL]_[GA360]_Conversion_Blockers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA PILL] [GA360] - Conversion Blockers.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "toc_visible": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "Copyright 2021 Google LLC\n", 21 | "\n", 22 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 23 | "you may not use this file except in compliance with the License.\n", 24 | "You may obtain a copy of the License at\n", 25 | "\n", 26 | " https://www.apache.org/licenses/LICENSE-2.0\n", 27 | "\n", 28 | "Unless required by applicable law or agreed to in writing, software\n", 29 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 30 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 31 | "See the License for the specific language governing permissions and\n", 32 | "limitations under the License.\n", 33 | "\n", 34 | "\n", 35 | "#Important\n", 36 | "This content are intended for educational and informational purposes only." 37 | ], 38 | "metadata": { 39 | "id": "J_xMGY09NFC3" 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "## Conversion Blockers Analysis\n", 46 | "
\n", 47 | "In this analysis we will be looking into main user characteristics captured by Google Analytics which can affect website UX and how they impact e-commerce transaction rate.\n", 48 | "
\n", 49 | "**Key notes / assumptions**\n", 50 | "
\n", 51 | "For the following analysis, we will call specific data properties (i.e. Browser version) a FEATURE, and each value of a feature (i.e. Chrome V10.1), a LABEL\n" 52 | ], 53 | "metadata": { 54 | "id": "bkSydQkR2uxI" 55 | } 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "source": [ 60 | "## Step 1: Setup " 61 | ], 62 | "metadata": { 63 | "id": "9jhVJvanZdvn" 64 | } 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "source": [ 69 | "### Install all dependencies and authorize bigQuery access \n", 70 | "\n", 71 | "---\n", 72 | "\n" 73 | ], 74 | "metadata": { 75 | "id": "7J7PyfCWcp5P" 76 | } 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "source": [ 82 | "# Import all necessary libs\n", 83 | "from google.colab import auth\n", 84 | "import pandas as pd\n", 85 | "import numpy as np\n", 86 | "from matplotlib import pyplot as plt\n", 87 | "from IPython.display import display, HTML\n", 88 | "\n", 89 | "# Authenticate the user to query datasets in Google BigQuery\n", 90 | "auth.authenticate_user()\n", 91 | "%matplotlib inline\n" 92 | ], 93 | "outputs": [], 94 | "metadata": { 95 | "id": "N1d8sVBCNjsb" 96 | } 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "source": [ 101 | "###Define analysis parameters" 102 | ], 103 | "metadata": { 104 | "id": "y8mvCvMw5PnC" 105 | } 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "source": [ 111 | "#@title Define the data source in BigQuery:\n", 112 | "project_id = 'bigquery-public-data' #@param\n", 113 | "dataset_name = 'google_analytics_sample' #@param\n", 114 | "table_name = 'ga_sessions_*'#@param\n", 115 | "start_date = '2014-10-01'#@param {type:\"date\"}\n", 116 | "end_date = '2019-12-12'#@param{type:\"date\"}\n", 117 | "billing_project_id = 'my-project' #@param" 118 | ], 119 | "outputs": [], 120 | "metadata": { 121 | "id": "f5NT-5pT22rn", 122 | "cellView": "form" 123 | } 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "source": [ 128 | "##Step 2: Create analysis building blocks" 129 | ], 130 | "metadata": { 131 | "id": "48dXc8TyZti9" 132 | } 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "source": [ 137 | "On the following coding blocks, we will create functions that will allow us to easily run the analysis multiple times, one for each feature" 138 | ], 139 | "metadata": { 140 | "id": "dfkKLHrUZJDV" 141 | } 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "source": [ 146 | "###Create query builder function based on tamplate" 147 | ], 148 | "metadata": { 149 | "id": "ga44qyoUcWi3" 150 | } 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "source": [ 156 | "#assemble dynamic content dictionary\n", 157 | "dc = {}\n", 158 | "dc['project_id'] = project_id\n", 159 | "dc['dataset_name'] = dataset_name\n", 160 | "dc['table_name'] = table_name\n", 161 | "dc['start_date'] = start_date.replace('-','')\n", 162 | "dc['end_date'] = end_date.replace('-','')\n", 163 | "\n", 164 | "#render final query function\n", 165 | "def render_final_query(dc, display = False):\n", 166 | " q1 = '''\n", 167 | " #fetch # of transaction, sessions and transaction rate for each feature value\n", 168 | " WITH t0 AS \n", 169 | " (SELECT\n", 170 | " {feature} AS feature,\n", 171 | " SUM(IFNULL(sessions.totals.transactions, 0)) AS transactions,\n", 172 | " COUNT(sessions.visitStartTime) AS count_sessions,\n", 173 | " SUM(IFNULL(sessions.totals.transactions, 0))/COUNT(sessions.visitStartTime) AS transaction_rate\n", 174 | " FROM\n", 175 | " 
`{project_id}.{dataset_name}.{table_name}` as sessions,\n", 176 | " UNNEST(hits) AS hits\n", 177 | " WHERE\n", 178 | " hits.hitNumber = 1 AND\n", 179 | " date BETWEEN '{start_date}'\n", 180 | " AND '{end_date}'\n", 181 | " GROUP BY 1\n", 182 | " ),\n", 183 | "\n", 184 | " # calculate % of total sessions of each feature value and global (avg) transaction rate \n", 185 | " t1 AS\n", 186 | " (\n", 187 | " SELECT \n", 188 | " *,\n", 189 | " SUM(count_sessions) OVER() AS total_sessions,\n", 190 | " SUM(transactions) OVER() AS total_transaction,\n", 191 | " AVG(transaction_rate) OVER() AS average_transaction_rate,\n", 192 | " count_sessions/SUM(count_sessions) OVER() AS sessions_percentage\n", 193 | " FROM t0\n", 194 | " ORDER BY transaction_rate\n", 195 | " )\n", 196 | "\n", 197 | " # limit results to only values that represent over 1% of all sessions\n", 198 | " # and, for the remaining rows, flag those with a transaction rate below 20% of the average\n", 199 | " SELECT *,\n", 200 | " IF(transaction_rate < average_transaction_rate * 0.2, true, false) AS below_limit\n", 201 | " FROM t1\n", 202 | " WHERE sessions_percentage > 0.01\n", 203 | " '''.format(**dc)\n", 204 | " if display:\n", 205 | " print('Final BigQuery SQL:')\n", 206 | " print(q1)\n", 207 | " return q1" 208 | ], 209 | "outputs": [], 210 | "metadata": { 211 | "id": "Z9w_ClosPdlD" 212 | } 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "source": [ 218 | "# run BigQuery query function\n", 219 | "def run_big_query(q):\n", 220 | " return pd.io.gbq.read_gbq(q, project_id=billing_project_id, verbose=False, dialect='standard')" 221 | ], 222 | "outputs": [], 223 | "metadata": { 224 | "id": "cNCQqNb0aDIX" 225 | } 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "source": [ 230 | "### Create function to display query results in a bar chart \n" 231 | ], 232 | "metadata": { 233 | "id": "dVz9AYy9ccw1" 234 | } 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "source": [ 240 | "def plot_graph(df, title):\n", 241 | " # define column colors\n", 242 | " colors = []\n", 243 | " for index, row in df.iterrows():\n", 244 | " below_limit = df['below_limit'][index]\n", 245 | " if below_limit:\n", 246 | " colors.append('r') # set color to red\n", 247 | " else:\n", 248 | " colors.append('b') # set color to blue\n", 249 | "\n", 250 | "\n", 251 | " # Specify this list of colors as the `color` option to `plot`.\n", 252 | " df.plot(x='feature', y='transaction_rate', kind='bar', stacked=False, color = colors, title = title, yticks=[])" 253 | ], 254 | "outputs": [], 255 | "metadata": { 256 | "id": "FlffPsUt8ez4" 257 | } 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "source": [ 262 | "## Step 3: Run entire pipeline for each feature and plot results\n" 263 | ], 264 | "metadata": { 265 | "id": "z1p7TXK2Lu5U" 266 | } 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "source": [ 272 | "# uncomment each line to enable that analysis\n", 273 | "features = [\n", 274 | "(\"Operating System\",\"CONCAT(sessions.device.operatingSystem, ' ', sessions.device.operatingSystemVersion)\"),\n", 275 | "(\"Browser\",\"CONCAT(sessions.device.browser, ' ', sessions.device.browserversion)\"),\n", 276 | "(\"Language\",\"sessions.device.language\"),\n", 277 | "#(\"Device Type\",\"sessions.device.deviceCategory\"),\n", 278 | "#(\"Country\",\"sessions.geoNetwork.country\"),\n", 279 | "#(\"Region\",\"sessions.geoNetwork.region\"),\n", 280 | "#(\"City\",\"sessions.geoNetwork.city\"),\n", 281 | "#(\"Landing 
Page\",\"CONCAT(hits.page.hostname, hits.page.pagePath)\"),\n", 282 | "#(\"Screen Pixels (e5)\",\"IF(ARRAY_LENGTH(SPLIT(sessions.device.screenResolution,'x')) = 2,ROUND(CAST(SPLIT(sessions.device.screenResolution,'x')[OFFSET(0)] AS INT64) * CAST(SPLIT(sessions.device.screenResolution,'x')[OFFSET(1)] AS INT64)/100000), Null)\")\n", 283 | "]" 284 | ], 285 | "outputs": [], 286 | "metadata": { 287 | "id": "tOUMtK44AIJT" 288 | } 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "source": [ 294 | "#for each feature Tuple\n", 295 | "for item in features:\n", 296 | " #define custom values for SQL Query generation\n", 297 | " dc['feature'] = item[1]\n", 298 | " #generate sql\n", 299 | " q = render_final_query(dc, display=True)\n", 300 | "\n", 301 | " # REMOVE LINE BELLOW to execute query (this might result in bigQuery costs)\n", 302 | "\n", 303 | " \n", 304 | " #run query in BQ\n", 305 | " df = run_big_query(q)\n", 306 | " #print query results\n", 307 | " print(\"Results for \" + item[0])\n", 308 | " display(df)\n", 309 | " print(\" \")\n", 310 | " #plot graph\n", 311 | " plot_graph(df, item[0])" 312 | ], 313 | "outputs": [], 314 | "metadata": { 315 | "id": "1nnGeTpIaLkz" 316 | } 317 | } 318 | ] 319 | } -------------------------------------------------------------------------------- /pills/GA/[DATA_PILL]_[GA360]_Feature_Importance_(v2).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA PILL] [GA360] - Feature Importance (v2).ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "MmVwSGJZdhwr" 20 | }, 21 | "source": [ 22 | "**PLEASE MAKE A COPY BEFORE CHANGING**\n", 23 | "\n", 24 | "**Copyright** 2021 Google LLC\n", 25 | "\n", 26 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 27 | "you may not use this file except in compliance with the License.\n", 28 | "You may obtain a copy of the License at\n", 29 | "\n", 30 | " https://www.apache.org/licenses/LICENSE-2.0\n", 31 | "\n", 32 | "Unless required by applicable law or agreed to in writing, software\n", 33 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 34 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 35 | "See the License for the specific language governing permissions and\n", 36 | "limitations under the License.\n", 37 | "\n", 38 | "\n", 39 | "Important\n", 40 | "This content are intended for educational and informational purposes only." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "PuoFxgMPfFvT" 47 | }, 48 | "source": [ 49 | "## Introduction \n", 50 | "Purpose: The goal of this colab is to show an example of how to calculate conversion probability. 
As a result, we can create the feature importance report.\n", 51 | "\n", 52 | "**Key notes**\n", 53 | "\n", 54 | "* This example assumes Enhanced Ecommerce is implemented (we are predicting transactions).\n", 55 | "* It is possible to adjust the query to predict other events instead of a transaction.\n", 56 | "\n", 57 | "**Instructions**\n", 58 | "* First of all: MAKE A COPY;\n", 59 | "* Fill in the query parameters in Box 1;\n", 60 | "* In the menu above, click Runtime > Run all;\n", 61 | "* Authorize your credentials." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "id": "RYYfkF-7vdTP" 68 | }, 69 | "source": [ 70 | "## User Input (Training Query)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "Wx8PfPYcLQqg", 77 | "cellView": "form" 78 | }, 79 | "source": [ 80 | "project_id = 'your-billing-project-id'#@param\n", 81 | "table = 'your-project-id.your-ga-dataset.ga_sessions_*'#@param\n", 82 | "lookback_start_date = '2018-08-01'#@param {type:\"date\"}\n", 83 | "lookback_end_date = '2018-08-31'#@param {type:\"date\"}\n", 84 | "conversion_window_start_date = '2018-09-01'#@param {type:\"date\"}\n", 85 | "conversion_window_end_date = '2018-09-30'#@param {type:\"date\"}\n", 86 | "prediction_type = 'transaction'#@param['transaction', 'event']\n", 87 | "event_filter_type = 'eventLabel'#@param['eventCategory', 'eventAction', 'eventLabel', ' ']\n", 88 | "event_filter_value = 'conversion'#@param\n", 89 | "test_size = 0.5#@param\n", 90 | "downsample_majority_class = 0.1#@param {type:\"slider\", min:0.1, max:1, step:0.1}" 91 | ], 92 | "execution_count": null, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": { 98 | "id": "Y2WbVWBxfEpi" 99 | }, 100 | "source": [ 101 | "## User Input (Classification Query)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "I_SC9p2bfCkK", 108 | "cellView": "form" 109 | }, 110 | "source": [ 111 | "classification_start_date = '2019-10-01'#@param {type:\"date\"}\n", 112 | "classification_end_date = '2019-10-31'#@param {type:\"date\"}\n", 113 | "index_dimension = 'ga:dimension14'#@param\n", 114 | "value_dimension = 'ga:dimension15'#@param\n" 115 | ], 116 | "execution_count": null, 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "id": "Kre62wxNviEg" 123 | }, 124 | "source": [ 125 | "## Code Section" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "_HJ-BSp1bniG" 132 | }, 133 | "source": [ 134 | "import matplotlib.pyplot as plt\n", 135 | "import numpy as np\n", 136 | "import pandas as pd\n", 137 | "import seaborn as sns\n", 138 | "from google.colab import auth, files\n", 139 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 140 | "from sklearn.compose import ColumnTransformer, make_column_transformer\n", 141 | "from sklearn.decomposition import PCA\n", 142 | "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n", 143 | "from sklearn.linear_model import LogisticRegressionCV\n", 144 | "from sklearn.metrics import roc_auc_score\n", 145 | "from sklearn.metrics import auc\n", 146 | "from sklearn.metrics import accuracy_score\n", 147 | "from sklearn.metrics import confusion_matrix\n", 148 | "from sklearn.metrics import roc_curve\n", 149 | "from sklearn.metrics import classification_report\n", 150 | "\n", 151 | "from sklearn.model_selection import GridSearchCV, train_test_split\n", 152 | "from 
sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion\n", 153 | "from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler\n", 154 | "\n", 155 | "%matplotlib inline" 156 | ], 157 | "execution_count": null, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "NFDW-WVVas3j" 164 | }, 165 | "source": [ 166 | "# Function to print results\n", 167 | "def results(X_test, Y_test, clf):\n", 168 | " probs = clf.predict_proba(X_test)\n", 169 | " auc_ = roc_auc_score(Y_test, probs[:,1])\n", 170 | " print(\"AUC: %.4f\" % auc_)\n", 171 | " predictions = clf.predict(X_test)\n", 172 | " print(\"accuracy: %.4f\" % accuracy_score(Y_test, predictions))\n", 173 | " print(classification_report(Y_test, clf.predict(X_test)))\n", 174 | "\n", 175 | "# Function to plot a ROC curve\n", 176 | "def plot_roc_curve(X_test, model):\n", 177 | " probs = model.predict_proba(X_test)\n", 178 | " preds = probs[:,1]\n", 179 | " fpr, tpr, threshold = roc_curve(y_test, preds) # uses the global y_test\n", 180 | " roc_auc = auc(fpr, tpr)\n", 181 | " plt.title('AdaBoost ROC Curve')\n", 182 | " plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)\n", 183 | " plt.legend(loc = 'lower right')\n", 184 | " plt.plot([0, 1], [0, 1],'r--')\n", 185 | " plt.xlim([0, 1])\n", 186 | " plt.ylim([0, 1])\n", 187 | " plt.ylabel('True Positive Rate')\n", 188 | " plt.xlabel('False Positive Rate')\n", 189 | " plt.show()\n", 190 | "\n", 191 | "# Function to plot feature importance (valid for models exposing feature_importances_, e.g. AdaBoost)\n", 192 | "def feature_relevance(X_test, model):\n", 193 | " names = X_test.columns\n", 194 | " feature_importance = model.feature_importances_\n", 195 | " # make importances relative to max importance\n", 196 | " feature_importance = 100.0 * (feature_importance / feature_importance.max())\n", 197 | " sorted_idx = np.argsort(feature_importance)\n", 198 | " pos = np.arange(sorted_idx.shape[0]) + .5\n", 199 | " plt.figure(figsize=(20,10))\n", 200 | " plt.subplot(1, 2, 2)\n", 201 | " plt.barh(pos, feature_importance[sorted_idx], align='center')\n", 202 | " plt.yticks(pos,map(lambda x: names[x], sorted_idx))\n", 203 | " plt.xlabel('Relative Importance')\n", 204 | " plt.title('Variable Importance')\n", 205 | " plt.show()" 206 | ], 207 | "execution_count": null, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "metadata": { 213 | "id": "9-B2SzEEcA0L" 214 | }, 215 | "source": [ 216 | "# Authenticate the user to access BigQuery Projects\n", 217 | "auth.authenticate_user()" 218 | ], 219 | "execution_count": null, 220 | "outputs": [] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "id": "4auG4L21vMMc" 226 | }, 227 | "source": [ 228 | "" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "XsgDpGu-arO9" 235 | }, 236 | "source": [ 237 | "# Build the query\n", 238 | "dc = {}\n", 239 | "dc['project_id'] = project_id\n", 240 | "dc['table'] = table\n", 241 | "dc['lookback_start_date'] = lookback_start_date.replace('-', '')\n", 242 | "dc['lookback_end_date'] = lookback_end_date.replace('-', '')\n", 243 | "dc['conversion_window_start_date'] = conversion_window_start_date.replace('-', '')\n", 244 | "dc['conversion_window_end_date'] = conversion_window_end_date.replace('-', '')\n", 245 | "dc['prediction_type'] = prediction_type\n", 246 | "dc['event_filter_type'] = event_filter_type\n", 247 | "dc['event_filter_value'] = event_filter_value\n", 248 | "dc['downsample_majority_class'] = downsample_majority_class\n", 249 | 
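"# classification window dates are used by the scoring query (q3) below\n",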
"dc['classification_start_date'] = classification_start_date.replace('-', '')\n", 250 | "dc['classification_end_date'] = classification_end_date.replace('-', '')\n", 251 | "\n", 252 | "q1 = \"\"\"\n", 253 | "WITH\n", 254 | " latest_session AS (\n", 255 | " SELECT\n", 256 | " * EXCEPT(rn)\n", 257 | " FROM (\n", 258 | " SELECT\n", 259 | " ROW_NUMBER() OVER(PARTITION BY clientid ORDER BY visitnumber DESC) AS rn,\n", 260 | " clientid,\n", 261 | " visitNumber,\n", 262 | " channelgrouping,\n", 263 | " IF(device.browser NOT IN ('Chrome', 'Safari', 'Firefox', 'Android Webview', 'Edge'), 'Others', device.browser) as browser,\n", 264 | " device.deviceCategory,\n", 265 | " IF(device.operatingSystem NOT IN('Android', 'iOS', 'Windows', 'Macintosh', 'Linux'), 'Others', device.operatingSystem ) AS operatingSystem,\n", 266 | " geoNetwork.region\n", 267 | " FROM\n", 268 | " `{table}`\n", 269 | " WHERE\n", 270 | " _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}'\n", 271 | " AND clientid IS NOT NULL)\n", 272 | " WHERE\n", 273 | " rn = 1 ),\n", 274 | "\n", 275 | "session_hits as (\n", 276 | "SELECT\n", 277 | " clientid,\n", 278 | " SUM(totals.visits) AS visits,\n", 279 | " SUM(totals.pageviews) AS pageviews,\n", 280 | " SUM(totals.hits) AS hits,\n", 281 | " SUM(totals.timeonsite) AS timeonsite,\n", 282 | " SUM(totals.bounces) AS bounces,\n", 283 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (5,6,7,8,9,10) THEN 1 ELSE 0 END) AS morning_visits,\n", 284 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (11,12,13,14,15,16) THEN 1 ELSE 0 END) AS daytime_visits,\n", 285 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (17,18,19,20,21,22) THEN 1 ELSE 0 END) AS evening_visits,\n", 286 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (23,24,0,1,2,3,4) THEN 1 ELSE 0 END) AS midnight_visits,\n", 287 | " SUM(totals.transactions) AS conversion,\n", 288 | " SUM(totals.totalTransactionRevenue) / 100000 AS revenue\n", 289 | "FROM\n", 290 | " `{table}`\n", 291 | "WHERE\n", 292 | " _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}' AND clientid IS NOT NULL\n", 293 | "GROUP BY 1),\n", 294 | " \n", 295 | "converted as (\n", 296 | "SELECT \n", 297 | " *\n", 298 | "FROM (\n", 299 | " SELECT \n", 300 | " clientid,\n", 301 | " SUM(totals.transactions) AS y_conversions\n", 302 | " FROM\n", 303 | " `{table}`\n", 304 | " WHERE\n", 305 | " _TABLE_SUFFIX BETWEEN '{conversion_window_start_date}' AND '{conversion_window_end_date}' AND clientid IS NOT NULL\n", 306 | " GROUP BY 1)\n", 307 | "WHERE \n", 308 | " y_conversions > 0\n", 309 | "),\n", 310 | "\n", 311 | "joined as(\n", 312 | "SELECT\n", 313 | " sh.clientid,\n", 314 | " ls.channelgrouping AS last_channel,\n", 315 | " ls.browser,\n", 316 | " ls.deviceCategory,\n", 317 | " ls.operatingSystem,\n", 318 | " ls.region,\n", 319 | " ls.visitnumber AS current_visit,\n", 320 | " IFNULL(SUM(sh.visits), 0) AS total_visits,\n", 321 | " IFNULL(SUM(sh.pageviews), 0) AS total_pageviews,\n", 322 | " IFNULL(SUM(sh.hits), 0) AS total_hits,\n", 323 | " IFNULL(SUM(sh.timeonsite), 0) AS total_timeonsite,\n", 324 | " IFNULL(SUM(sh.bounces), 0) AS total_bounces,\n", 325 | " IFNULL(SUM(sh.morning_visits), 0) AS total_morning_visits,\n", 326 | " IFNULL(SUM(sh.daytime_visits), 0) AS total_daytime_visits,\n", 327 | " 
IFNULL(SUM(sh.evening_visits), 0) AS total_evening_visits,\n", 328 | " IFNULL(SUM(sh.midnight_visits), 0) AS total_midnight_visits,\n", 329 | " IFNULL(SUM(sh.conversion), 0) AS total_conversions,\n", 330 | " IF(IFNULL(SUM(c.y_conversions), 0) > 0, 1, 0) AS y_conversions\n", 331 | "FROM\n", 332 | " session_hits sh LEFT OUTER JOIN latest_session ls\n", 333 | " ON sh.clientid = ls.clientid\n", 334 | " LEFT OUTER JOIN converted c ON sh.clientid = c.clientid\n", 335 | "GROUP BY 1,2,3,4,5,6,7)\n", 336 | "\n", 337 | "SELECT * FROM joined WHERE y_conversions = 0 AND RAND() <= {downsample_majority_class} UNION ALL(SELECT * FROM joined WHERE y_conversions = 1)\n", 338 | "\"\"\".format(**dc)\n", 339 | "\n", 340 | "q2 = \"\"\"\n", 341 | "\n", 342 | "WITH\n", 343 | " latest_session AS (\n", 344 | " SELECT\n", 345 | " * EXCEPT(rn)\n", 346 | " FROM (\n", 347 | " SELECT\n", 348 | " ROW_NUMBER() OVER(PARTITION BY clientid ORDER BY visitnumber DESC) AS rn,\n", 349 | " clientid,\n", 350 | " visitNumber,\n", 351 | " channelgrouping,\n", 352 | " IF(device.browser NOT IN ('Chrome', 'Safari', 'Firefox', 'Samsung Internet', 'Android Webview', 'Edge'), 'Others', device.browser) as browser,\n", 353 | " device.deviceCategory,\n", 354 | " IF(device.operatingSystem NOT IN('Android', 'iOS', 'Windows', 'Macintosh', 'Linux'), 'Others', device.operatingSystem ) AS operatingSystem,\n", 355 | " geoNetwork.region\n", 356 | " FROM\n", 357 | " `{table}`\n", 358 | " WHERE\n", 359 | " _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}'\n", 360 | " AND clientid IS NOT NULL)\n", 361 | " WHERE\n", 362 | " rn = 1 ),\n", 363 | "\n", 364 | "session_hits as (\n", 365 | "SELECT\n", 366 | " clientid,\n", 367 | " SUM(totals.visits) AS visits,\n", 368 | " SUM(totals.pageviews) AS pageviews,\n", 369 | " SUM(totals.hits) AS hits,\n", 370 | " SUM(totals.timeonsite) AS timeonsite,\n", 371 | " SUM(totals.bounces) AS bounces,\n", 372 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (5,6,7,8,9,10) THEN 1 ELSE 0 END) AS morning_visits,\n", 373 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (11,12,13,14,15,16) THEN 1 ELSE 0 END) AS daytime_visits,\n", 374 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (17,18,19,20,21,22) THEN 1 ELSE 0 END) AS evening_visits,\n", 375 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (23,24,0,1,2,3,4) THEN 1 ELSE 0 END) AS midnight_visits,\n", 376 | " SUM(totals.transactions) AS conversion,\n", 377 | " SUM(totals.totalTransactionRevenue) / 100000 AS revenue\n", 378 | "FROM\n", 379 | " `{table}`\n", 380 | "WHERE\n", 381 | " _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}' AND clientid IS NOT NULL\n", 382 | "GROUP BY 1),\n", 383 | " \n", 384 | "converted as (\n", 385 | "SELECT \n", 386 | " *\n", 387 | "FROM (\n", 388 | " SELECT \n", 389 | " clientid,\n", 390 | " COUNT(1) AS y_conversions\n", 391 | " FROM\n", 392 | " `{table}`, UNNEST(hits) h\n", 393 | " WHERE\n", 394 | " _TABLE_SUFFIX BETWEEN '{conversion_window_start_date}' AND '{conversion_window_end_date}' AND clientid IS NOT NULL\n", 395 | " AND h.eventInfo.{event_filter_type}\t= '{event_filter_value}'\n", 396 | " GROUP BY 1)\n", 397 | "WHERE \n", 398 | " y_conversions > 0\n", 399 | "),\n", 400 | "\n", 401 | "joined as(\n", 402 | "SELECT\n", 403 | " sh.clientid,\n", 404 | " 
ls.channelgrouping AS last_channel,\n", 405 | " ls.browser,\n", 406 | " ls.deviceCategory,\n", 407 | " ls.operatingSystem,\n", 408 | " ls.region,\n", 409 | " ls.visitnumber AS current_visit,\n", 410 | " IFNULL(SUM(sh.visits), 0) AS total_visits,\n", 411 | " IFNULL(SUM(sh.pageviews), 0) AS total_pageviews,\n", 412 | " IFNULL(SUM(sh.hits), 0) AS total_hits,\n", 413 | " IFNULL(SUM(sh.timeonsite), 0) AS total_timeonsite,\n", 414 | " IFNULL(SUM(sh.bounces), 0) AS total_bounces,\n", 415 | " IFNULL(SUM(sh.morning_visits), 0) AS total_morning_visits,\n", 416 | " IFNULL(SUM(sh.daytime_visits), 0) AS total_daytime_visits,\n", 417 | " IFNULL(SUM(sh.evening_visits), 0) AS total_evening_visits,\n", 418 | " IFNULL(SUM(sh.midnight_visits), 0) AS total_midnight_visits,\n", 419 | " IFNULL(SUM(sh.conversion), 0) AS total_conversions,\n", 420 | " IF(IFNULL(SUM(c.y_conversions), 0) > 0, 1, 0) AS y_conversions\n", 421 | "FROM\n", 422 | " session_hits sh LEFT OUTER JOIN latest_session ls\n", 423 | " ON sh.clientid = ls.clientid\n", 424 | " LEFT OUTER JOIN converted c ON sh.clientid = c.clientid\n", 425 | "GROUP BY 1,2,3,4,5,6,7)\n", 426 | "\n", 427 | "SELECT * FROM joined WHERE y_conversions = 0 AND RAND() <= {downsample_majority_class} UNION ALL(SELECT * FROM joined WHERE y_conversions = 1)\n", 428 | "\n", 429 | "\"\"\".format(**dc)\n", 430 | "\n", 431 | "\n", 432 | "q3 = \"\"\"\n", 433 | "WITH\n", 434 | " latest_session AS (\n", 435 | " SELECT\n", 436 | " * EXCEPT(rn)\n", 437 | " FROM (\n", 438 | " SELECT\n", 439 | " ROW_NUMBER() OVER(PARTITION BY clientid ORDER BY visitnumber DESC) AS rn,\n", 440 | " clientid,\n", 441 | " visitNumber,\n", 442 | " channelgrouping,\n", 443 | " IF(device.browser NOT IN ('Chrome', 'Safari', 'Firefox', 'Android Webview', 'Edge'), 'Others', device.browser) as browser,\n", 444 | " device.deviceCategory,\n", 445 | " IF(device.operatingSystem NOT IN('Android', 'iOS', 'Windows', 'Macintosh', 'Linux'), 'Others', device.operatingSystem ) AS operatingSystem,\n", 446 | " geoNetwork.region\n", 447 | " FROM\n", 448 | " `{table}`\n", 449 | " WHERE\n", 450 | " _TABLE_SUFFIX BETWEEN '{classification_start_date}' AND '{classification_end_date}'\n", 451 | " AND clientid IS NOT NULL)\n", 452 | " WHERE\n", 453 | " rn = 1 ),\n", 454 | "\n", 455 | "session_hits as (\n", 456 | "SELECT\n", 457 | " clientid,\n", 458 | " SUM(totals.visits) AS visits,\n", 459 | " SUM(totals.pageviews) AS pageviews,\n", 460 | " SUM(totals.hits) AS hits,\n", 461 | " SUM(totals.timeonsite) AS timeonsite,\n", 462 | " SUM(totals.bounces) AS bounces,\n", 463 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (5,6,7,8,9,10) THEN 1 ELSE 0 END) AS morning_visits,\n", 464 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (11,12,13,14,15,16) THEN 1 ELSE 0 END) AS daytime_visits,\n", 465 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (17,18,19,20,21,22) THEN 1 ELSE 0 END) AS evening_visits,\n", 466 | " SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE \"America/Los_Angeles\") IN (23,24,0,1,2,3,4) THEN 1 ELSE 0 END) AS midnight_visits,\n", 467 | " SUM(totals.transactions) AS conversion,\n", 468 | " SUM(totals.totalTransactionRevenue) / 100000 AS revenue\n", 469 | "FROM\n", 470 | " `{table}`\n", 471 | "WHERE\n", 472 | " _TABLE_SUFFIX BETWEEN '{classification_start_date}' AND '{classification_end_date}' AND clientid IS NOT 
NULL\n", 473 | "GROUP BY 1),\n", 474 | " \n", 475 | "\n", 476 | "\n", 477 | "joined as(\n", 478 | "SELECT\n", 479 | "  sh.clientid,\n", 480 | "  ls.channelgrouping AS last_channel,\n", 481 | "  ls.browser,\n", 482 | "  ls.deviceCategory,\n", 483 | "  ls.operatingSystem,\n", 484 | "  ls.region,\n", 485 | "  ls.visitnumber AS current_visit,\n", 486 | "  IFNULL(SUM(sh.visits), 0) AS total_visits,\n", 487 | "  IFNULL(SUM(sh.pageviews), 0) AS total_pageviews,\n", 488 | "  IFNULL(SUM(sh.hits), 0) AS total_hits,\n", 489 | "  IFNULL(SUM(sh.timeonsite), 0) AS total_timeonsite,\n", 490 | "  IFNULL(SUM(sh.bounces), 0) AS total_bounces,\n", 491 | "  IFNULL(SUM(sh.morning_visits), 0) AS total_morning_visits,\n", 492 | "  IFNULL(SUM(sh.daytime_visits), 0) AS total_daytime_visits,\n", 493 | "  IFNULL(SUM(sh.evening_visits), 0) AS total_evening_visits,\n", 494 | "  IFNULL(SUM(sh.midnight_visits), 0) AS total_midnight_visits,\n", 495 | "  IFNULL(SUM(sh.conversion), 0) AS total_conversions\n", 496 | "FROM\n", 497 | "  session_hits sh LEFT OUTER JOIN latest_session ls\n", 498 | "  ON sh.clientid = ls.clientid\n", 499 | "GROUP BY 1,2,3,4,5,6,7)\n", 500 | "\n", 501 | "SELECT * FROM joined\n", 502 | "\n", 503 | "\n", 504 | "\"\"\".format(**dc)\n", 505 | "if prediction_type == 'transaction':\n", 506 | "  q = q1\n", 507 | "else:\n", 508 | "  q = q2" 509 | ], 510 | "execution_count": null, 511 | "outputs": [] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "metadata": { 516 | "id": "QVubqAV6NqPp" 517 | }, 518 | "source": [ 519 | "%%time\n", 520 | "df = pd.io.gbq.read_gbq(q, project_id=project_id, verbose=False, dialect='standard')" 521 | ], 522 | "execution_count": null, 523 | "outputs": [] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "metadata": { 528 | "id": "2m-XciRYMO4-" 529 | }, 530 | "source": [ 531 | "df.head()" 532 | ], 533 | "execution_count": null, 534 | "outputs": [] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "metadata": { 539 | "id": "oAeRsAGqSFmM" 540 | }, 541 | "source": [ 542 | "print(\"Dataset has {} rows and {} columns\".format(df.shape[0], df.shape[1]))\n", 543 | "print()\n", 544 | "print(\"Class distribution:\")\n", 545 | "print(df.y_conversions.value_counts())\n", 546 | "print()\n", 547 | "print(\"Converter to non-converter ratio:\")\n", 548 | "print(df.y_conversions.value_counts()[1] / df.y_conversions.value_counts()[0])" 549 | ], 550 | "execution_count": null, 551 | "outputs": [] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "metadata": { 556 | "id": "XGPTJpKLnC-d" 557 | }, 558 | "source": [ 559 | "# Drop the label and clientid (Xs)\n", 560 | "X_all = df.drop(['y_conversions', 'clientid', 'region', 'browser', 'operatingSystem'], axis=1)\n", 561 | "# Select the label to predict (Ys)\n", 562 | "y_all = df['y_conversions']\n", 563 | "# Get all categorical columns in a list.\n", 564 | "text = list(X_all.select_dtypes(include=['object', 'category']).columns)\n", 565 | "# Get all numeric columns in a list.\n", 566 | "numbers = list(X_all.select_dtypes(include=np.number))\n", 567 | "# Convert categoricals into the proper type\n", 568 | "X_all.loc[:,text] = X_all.loc[:,text].astype('category')\n", 569 | "# Stratified split into train, test sets\n", 570 | "X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, \n", 571 | "                                                    test_size = test_size,\n", 572 | "                                                    random_state = 4,\n", 573 | "                                                    stratify = y_all)" 574 | ], 575 | "execution_count": null, 576 | "outputs": [] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "metadata": { 581 | "id": "VzOLghgQDsQb" 582 | }, 583 | "source": [ 584 | 
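"# Note (added sketch of assumed context): `text` and `numbers` are the categorical and\n",
"# numeric column lists and `test_size` the split fraction, all defined in earlier cells.\n",
"# The ColumnTransformer one-hot encodes the categorical columns and standardizes the\n",
"# numeric ones; handle_unknown='ignore' keeps scoring from failing on categories that\n",
"# were never seen during training.\n",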
"# Build and fit the pipeline\n", 585 | "\n", 586 | "preprocess = make_column_transformer(\n", 587 | " (OneHotEncoder(handle_unknown='ignore'), text),\n", 588 | " (StandardScaler(), numbers))\n", 589 | "\n", 590 | "pipe_ada = make_pipeline(\n", 591 | " preprocess,\n", 592 | " AdaBoostClassifier(n_estimators=150, learning_rate=0.1, random_state=42)\n", 593 | ")\n", 594 | "\n", 595 | "pipe_reg = make_pipeline(\n", 596 | " preprocess,\n", 597 | " LogisticRegressionCV(max_iter=1000)\n", 598 | ")\n", 599 | "\n", 600 | "pipe_ada.fit(X_train,y_train)\n", 601 | "pipe_reg.fit(X_train, y_train)\n", 602 | "\n", 603 | ";" 604 | ], 605 | "execution_count": null, 606 | "outputs": [] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "metadata": { 611 | "id": "a7gaiZIen2e1" 612 | }, 613 | "source": [ 614 | "# Plot the model results\n", 615 | "print('Adaboosting Results:')\n", 616 | "print()\n", 617 | "results(X_test, y_test, pipe_ada)\n", 618 | "plot_roc_curve(X_test, pipe_ada)\n", 619 | "\n", 620 | "print()\n", 621 | "print('Logistic Regression Results:')\n", 622 | "print()\n", 623 | "results(X_test, y_test, pipe_reg)\n", 624 | "plot_roc_curve(X_test, pipe_reg)" 625 | ], 626 | "execution_count": null, 627 | "outputs": [] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "metadata": { 632 | "id": "k09QFncT8WW1" 633 | }, 634 | "source": [ 635 | "cat = list(pipe_ada.named_steps.columntransformer.transformers_[0][1].get_feature_names())\n", 636 | "features = cat + numbers\n", 637 | "X_test = pipe_ada.named_steps.columntransformer.transform(X_test)\n", 638 | "X_test = pd.DataFrame(X_test, columns = features)" 639 | ], 640 | "execution_count": null, 641 | "outputs": [] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "metadata": { 646 | "id": "VvoYzq2IGNwl" 647 | }, 648 | "source": [ 649 | "feature_relevance(X_test, pipe_ada.named_steps.adaboostclassifier)" 650 | ], 651 | "execution_count": null, 652 | "outputs": [] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "metadata": { 657 | "id": "J29fUCshVlvn" 658 | }, 659 | "source": [ 660 | "%%time\n", 661 | "df = pd.io.gbq.read_gbq(q3, project_id=project_id, verbose=False, dialect='standard')" 662 | ], 663 | "execution_count": null, 664 | "outputs": [] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "metadata": { 669 | "id": "yarEJhybV8cx" 670 | }, 671 | "source": [ 672 | "preds = pipe_reg.predict_proba(df.drop(['clientid', 'region', 'browser', 'operatingSystem'],1))\n", 673 | "df['prob'] = preds[:,1]\n", 674 | "df.head(10)" 675 | ], 676 | "execution_count": null, 677 | "outputs": [] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "metadata": { 682 | "id": "5x-eKu4rW2dQ" 683 | }, 684 | "source": [ 685 | "df['segment'] = df.prob.apply(lambda x: 'high' if x > 0.5 else('medium' if x >0.3 else 'low'))" 686 | ], 687 | "execution_count": null, 688 | "outputs": [] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "metadata": { 693 | "id": "r3CbeZ6miVsq" 694 | }, 695 | "source": [ 696 | "df.segment.value_counts()" 697 | ], 698 | "execution_count": null, 699 | "outputs": [] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "metadata": { 704 | "id": "gzmmnfQUYCvq" 705 | }, 706 | "source": [ 707 | "df = df.loc[:, ['clientid', 'segment']]\n", 708 | "df.columns = [index_dimension, value_dimension]\n", 709 | "df.to_csv('dataset.csv', index=False)" 710 | ], 711 | "execution_count": null, 712 | "outputs": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "metadata": { 717 | "id": "GTAVp-q7icxP" 718 | }, 719 | "source": [ 720 | "df.head()" 721 | ], 722 | 
"execution_count": null, 723 | "outputs": [] 724 | } 725 | ] 726 | } -------------------------------------------------------------------------------- /pills/GA/[DATA_PILL]_[GA360]_Predictive_Lifetime_Value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA PILL] [GA360] - Predictive Lifetime Value.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "aG6_2HbY7X1H" 20 | }, 21 | "source": [ 22 | "**PLEASE MAKE A COPY BEFORE CHANGING**\n", 23 | "\n", 24 | "**Copyright** 2021 Google LLC\n", 25 | "\n", 26 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 27 | "you may not use this file except in compliance with the License.\n", 28 | "You may obtain a copy of the License at\n", 29 | "\n", 30 | " https://www.apache.org/licenses/LICENSE-2.0\n", 31 | "\n", 32 | "Unless required by applicable law or agreed to in writing, software\n", 33 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 34 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 35 | "See the License for the specific language governing permissions and\n", 36 | "limitations under the License.\n", 37 | "\n", 38 | "\n", 39 | "Important\n", 40 | "This content are intended for educational and informational purposes only." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "reE-i-9V7Efh" 47 | }, 48 | "source": [ 49 | "## Introduction\n", 50 | "\n", 51 | "**objective**: The goal of this colab is to calculate the Lifetime Value of your customer base. The method used for calculation is the BG/NDB model as described in this [paper](http://mktg.uni-svishtov.bg/ivm/resources/Counting_Your_Customers.pdf). 
\n", 52 | "\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "id": "hXoq1O3vSlj-" 59 | }, 60 | "source": [ 61 | "# Code Section" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "metadata": { 67 | "id": "mwTtzedhvDvH" 68 | }, 69 | "source": [ 70 | "!pip install -q lifetimes\n", 71 | "!pip install -q --upgrade git+https://github.com/HIPS/autograd.git@master \n", 72 | "!pip install -U -q PyDrive" 73 | ], 74 | "execution_count": null, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "JxUJRrvUjUr4" 81 | }, 82 | "source": [ 83 | "from google.colab import auth\n", 84 | "from googleapiclient.discovery import build\n", 85 | "from pydrive.auth import GoogleAuth\n", 86 | "from pydrive.drive import GoogleDrive\n", 87 | "from oauth2client.client import GoogleCredentials\n", 88 | "\n", 89 | "from lifetimes import BetaGeoFitter\n", 90 | "from lifetimes.plotting import plot_frequency_recency_matrix\n", 91 | "from lifetimes.plotting import plot_probability_alive_matrix\n", 92 | "from lifetimes.plotting import plot_period_transactions\n", 93 | "from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases\n", 94 | "from lifetimes.plotting import plot_history_alive\n", 95 | "from lifetimes import GammaGammaFitter\n", 96 | "\n", 97 | "import datetime\n", 98 | "import pandas as pd\n", 99 | "import matplotlib.pyplot as plt\n", 100 | "\n", 101 | "# Allow plots to be displayed inline.\n", 102 | "%matplotlib inline\n", 103 | "\n", 104 | "# Authenticate the user\n", 105 | "auth.authenticate_user()" 106 | ], 107 | "execution_count": null, 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "id": "o3_-vXnmTIJ5" 114 | }, 115 | "source": [ 116 | "### Input your settings to run the model\n", 117 | "\n", 118 | "* **project_id**: The ID of the Google Cloud project where the query will run.\n", 119 | "* **table_name**: Name of the Google Analytics BigQuery table that holds the\n", 120 | "transactions.\n", 121 | "* **time_unit**: The granularity to group transactions by (weeks usually works best).\n", 122 | "* **units_to_predict**: Number of periods to predict (with weeks as the time_unit, 52 predicts a year ahead).\n", 123 | "* **number_of_segments**: Number of segments to group the users into.\n", 124 | "* **id_type**: Two ID options: client_id is based on the GA cookie (not cross-device and can change over time); user_id can provide better accuracy.\n", 125 | "* **user_id_dimension_index**: The custom dimension index that holds the user id value. If id_type is user_id, this field is mandatory.\n", 126 | "* **data_import_key_index**: The custom dimension index that holds the user id. If you are not sure, don't worry, it can be changed later.\n", 127 | "* **data_import_value_index**: The custom dimension index that holds the LTV segment. 
If you are not sure, don't worry, it can be changed later.\n", 128 | "\n", 129 | "\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "7Mlc3e_8S3qK" 136 | }, 137 | "source": [ 138 | "### Parameters" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "7YrNXngZtNNe" 145 | }, 146 | "source": [ 147 | "project_id = 'my-project'#@param\n", 148 | "table_name = 'bigquery-public-data.google_analytics_sample.ga_sessions_*'#@param\n", 149 | "time_unit = 'weeks'#@param ['days', 'weeks', 'months']\n", 150 | "units_to_predict = 52#@param\n", 151 | "start_date = '2018-01-01'#@param{type:\"date\"}\n", 152 | "end_date = '2019-01-01'#@param{type:\"date\"}\n", 153 | "number_of_segments = 5#@param\n", 154 | "id_type = 'client_id'#@param ['client_id', 'user_id']\n", 155 | "user_id_dimension_index = 0#@param\n", 156 | "data_import_key_index = 11#@param\n", 157 | "data_import_value_index = 12#@param\n", 158 | "\n", 159 | "dc = {}\n", 160 | "dc['start_date'] = start_date.replace('-', '')\n", 161 | "dc['end_date'] = end_date.replace('-', '')\n", 162 | "dc['table_name'] = table_name\n", 163 | "dc['id_type'] = id_type\n", 164 | "dc['user_id_dimension_index'] = user_id_dimension_index\n", 165 | "\n", 166 | "if time_unit == 'days':\n", 167 | "  dc['time_unit'] = 1\n", 168 | "elif time_unit == 'weeks' or time_unit == '':\n", 169 | "  dc['time_unit'] = 7\n", 170 | "else:\n", 171 | "  dc['time_unit'] = 30  # approximate days per month\n" 172 | ], 173 | "execution_count": null, 174 | "outputs": [] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "id": "KSrNvcEbWTOj" 180 | }, 181 | "source": [ 182 | "### Get data from BigQuery\n", 183 | "\n", 184 | "Query the Google Analytics 360 export in BigQuery to create the RFM matrix containing the following columns.\n", 185 | "\n", 186 | "\n", 187 | "* **user_id**: ID of the user.\n", 188 | "* **total_transactions**: Number of transactions during the period.\n", 189 | "* **average_order_value**: Sum of total_transaction_value / total_transactions\n", 190 | "* **frequency**: Represents the number of repeat purchases the customer has made.\n", 191 | "* **recency**: Represents the age of the customer when they made their most recent purchase. This is equal to the duration between a customer’s first purchase and their latest purchase. (Thus if they have made only 1 purchase, the recency is 0.)\n", 192 | "* **T**: Represents the age of the customer in whatever time unit was chosen (weeks, in the above dataset). 
This is equal to the duration between a customer’s first purchase and the end of the period under study.\n", 193 | "\n", 194 | "\n", 195 | "\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "v-HSwebe7zF9" 202 | }, 203 | "source": [ 204 | "q1 = \"\"\"\n", 205 | "\n", 206 | "WITH transactions as (\n", 207 | "SELECT\n", 208 | "  clientid AS user_id,\n", 209 | "  PARSE_DATE('%Y%m%d', date) AS transaction_date,\n", 210 | "  SUM(totals.totalTransactionRevenue) / 1000000 AS transaction_value\n", 211 | "FROM \n", 212 | "  `{table}`\n" if False else "  `{table_name}`\n", 213 | "  WHERE\n", 214 | "  _TABLE_SUFFIX BETWEEN '{start_date}'\n", 215 | "  AND '{end_date}' AND totals.transactions > 0\n", 216 | "  AND totals.totalTransactionRevenue > 0\n", 217 | "GROUP BY\n", 218 | "  1, 2)\n", 219 | "  \n", 220 | "SELECT\n", 221 | "  user_id,\n", 222 | "  COUNT(1) AS total_transactions,\n", 223 | "  ROUND(SUM(transaction_value)/COUNT(1),1) AS average_order_value,\n", 224 | "  (COUNT(1)-1) AS frequency, \n", 225 | "  ROUND (DATE_DIFF(MAX(transaction_date),\n", 226 | "          MIN(transaction_date),\n", 227 | "          DAY) / {time_unit} ,1) # time multiplier\n", 228 | "  AS recency,\n", 229 | "\n", 230 | "  ROUND((DATE_DIFF((SELECT MAX(transaction_date) FROM transactions),\n", 231 | "        MIN(transaction_date),\n", 232 | "        DAY)+1) / {time_unit} ,1) # time multiplier\n", 233 | "  AS T\n", 234 | "FROM\n", 235 | "  transactions\n", 236 | "GROUP BY\n", 237 | "  1\n", 238 | "  \n", 239 | "\"\"\".format(**dc)\n", 240 | "\n", 241 | "\n", 242 | "q2 = \"\"\"\n", 243 | "\n", 244 | "WITH transactions as(\n", 245 | "SELECT\n", 246 | "  cds.value as user_id,\n", 247 | "  PARSE_DATE('%Y%m%d', date) AS transaction_date,\n", 248 | "  SUM(totals.totalTransactionRevenue) / 1000000 AS transaction_value\n", 249 | "FROM\n", 250 | "  `{table_name}`,\n", 251 | "  UNNEST(customdimensions) AS cds\n", 252 | "WHERE\n", 253 | "  _TABLE_SUFFIX BETWEEN '{start_date}'\n", 254 | "  AND '{end_date}'\n", 255 | "  AND cds.index = {user_id_dimension_index}\n", 256 | "  AND totals.transactions > 0\n", 257 | "GROUP BY\n", 258 | "  1,2)\n", 259 | "  \n", 260 | "SELECT\n", 261 | "  user_id,\n", 262 | "  COUNT(1) AS total_transactions,\n", 263 | "  ROUND(SUM(transaction_value)/COUNT(1),1) AS average_order_value,\n", 264 | "  (COUNT(1)-1) AS frequency, \n", 265 | "  ROUND (DATE_DIFF(MAX(transaction_date),\n", 266 | "          MIN(transaction_date),\n", 267 | "          DAY) / {time_unit} ,1) # time multiplier\n", 268 | "  AS recency,\n", 269 | "\n", 270 | "  ROUND((DATE_DIFF((SELECT MAX(transaction_date) FROM transactions),\n", 271 | "        MIN(transaction_date),\n", 272 | "        DAY)+1) / {time_unit} ,1) # time multiplier\n", 273 | "  AS T\n", 274 | "FROM\n", 275 | "  transactions\n", 276 | "GROUP BY\n", 277 | "  1\n", 278 | "\"\"\".format(**dc)\n", 279 | "\n", 280 | "if id_type == 'user_id' and user_id_dimension_index != 0:\n", 281 | "  q = q2\n", 282 | "else:\n", 283 | "  q = q1\n" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "HJgr69Q660q8" 292 | }, 293 | "source": [ 294 | "df = pd.io.gbq.read_gbq(q, project_id=project_id, verbose=False, dialect='standard') " 295 | ], 296 | "execution_count": null, 297 | "outputs": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "id": "Jijd7nI9ppt9" 303 | }, 304 | "source": [ 305 | "df.head()" 306 | ], 307 | "execution_count": null, 308 | "outputs": [] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "id": "fkjAd28SccGM" 314 | }, 315 | "source": [ 316 | "### Train the 
model\n", 317 | "Using the transformed dataset we fit the BetaGeoFitter model. Once the model is trained, we can plot the frequency/recency and probability-of-being-alive matrices.\n", 318 | "\n", 319 | "Once the BetaGeoFitter model is trained, we can also fit a Gamma-Gamma model to estimate future average order value." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "E4qvnRbaXZTv" 326 | }, 327 | "source": [ 328 | "bgf = BetaGeoFitter(penalizer_coef=0.0)\n", 329 | "bgf.fit(df['frequency'], df['recency'], df['T'])" 330 | ], 331 | "execution_count": null, 332 | "outputs": [] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "metadata": { 337 | "id": "x2UDtwNlzPnp" 338 | }, 339 | "source": [ 340 | "plt.figure(figsize=(10,10))\n", 341 | "plot_frequency_recency_matrix(bgf)\n", 342 | ";" 343 | ], 344 | "execution_count": null, 345 | "outputs": [] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "id": "F9UxML7cot6G" 351 | }, 352 | "source": [ 353 | "plt.figure(figsize=(10,10))\n", 354 | "plot_probability_alive_matrix(bgf)\n", 355 | ";" 356 | ], 357 | "execution_count": null, 358 | "outputs": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "metadata": { 363 | "id": "lW4AeDa909HC" 364 | }, 365 | "source": [ 366 | "ggf = GammaGammaFitter(penalizer_coef = 0)\n", 367 | "ggf.fit(df['total_transactions'],\n", 368 | "        df['average_order_value'])" 369 | ], 370 | "execution_count": null, 371 | "outputs": [] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "metadata": { 376 | "id": "0oeF7L384uPW" 377 | }, 378 | "source": [ 379 | "df['prob_alive'] = bgf.conditional_probability_alive(df['frequency'], df['recency'], df['T'])\n", 380 | "df['predicted_transactions'] = bgf.conditional_expected_number_of_purchases_up_to_time(units_to_predict, df['frequency'], df['recency'], df['T'])\n", 381 | "df['predicted_aov'] = ggf.conditional_expected_average_profit(df['total_transactions'], df['average_order_value'])\n", 382 | "df['predicted_ltv'] = df['predicted_transactions'] * df['predicted_aov']\n" 383 | ], 384 | "execution_count": null, 385 | "outputs": [] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": { 390 | "id": "6DFYUqmSdxIN" 391 | }, 392 | "source": [ 393 | "### Results\n", 394 | "\n", 395 | "In this section we display the following by user:\n", 396 | "\n", 397 | "\n", 398 | "\n", 399 | "* **prob_alive**: The probability of a customer being alive\n", 400 | "* **predicted_transactions**: The predicted number of transactions in the predicted period\n", 401 | "* **predicted_aov**: The predicted average order value in the predicted period\n", 402 | "* **predicted_ltv**: The total customer lifetime value for the predicted period. (predicted_ltv = predicted_aov * predicted_transactions)\n", 403 | "\n", 404 | "\n", 405 | "\n", 406 | "\n", 407 | "\n", 408 | "\n", 409 | "We also group the customers into N segments." 
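,
"\n",
"\n",
"(Usage note: `pd.qcut` in the next cell makes quantile-based cuts, so the N segments come out roughly equal in size and segment N contains the users with the highest predicted LTV.)"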
410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "metadata": { 415 | "id": "wqjNkQKeb8ma" 416 | }, 417 | "source": [ 418 | "df['segment'] = pd.qcut(df['predicted_ltv'], number_of_segments, labels=list(range(1,number_of_segments+1)))\n", 419 | "summary = df.groupby('segment').agg({'prob_alive':'mean', 'predicted_transactions': 'mean', \n", 420 | " 'predicted_aov': 'mean', \n", 421 | " 'predicted_ltv': ['mean','sum']})" 422 | ], 423 | "execution_count": null, 424 | "outputs": [] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "metadata": { 429 | "id": "Rj5mrsZFY9-t" 430 | }, 431 | "source": [ 432 | "summary = summary.round(2)" 433 | ], 434 | "execution_count": null, 435 | "outputs": [] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "metadata": { 440 | "id": "1p0YF73yY-sX" 441 | }, 442 | "source": [ 443 | "df.head()" 444 | ], 445 | "execution_count": null, 446 | "outputs": [] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "metadata": { 451 | "id": "SJA_d-YATHUp" 452 | }, 453 | "source": [ 454 | "data_import_key_index = 'ga:dimension{}'.format(data_import_key_index)\n", 455 | "data_import_value_index = 'ga:dimension{}'.format(data_import_value_index)\n", 456 | "\n", 457 | "df = df[['user_id', 'segment']] \n", 458 | "df.columns = [data_import_key_index, data_import_value_index]\n", 459 | "df.head()" 460 | ], 461 | "execution_count": null, 462 | "outputs": [] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "id": "uJXFNdBTUNg2" 468 | }, 469 | "source": [ 470 | "# Optional: Save output to drive." 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "vekNf6jon4S5" 477 | }, 478 | "source": [ 479 | "def output_to_googledrive(df, output_file_name='ltv.csv'):\n", 480 | " date = str(datetime.datetime.today()).split()[0]\n", 481 | " file_name = date + '_' + output_file_name\n", 482 | " file_url = 'https://drive.google.com/open?id='\n", 483 | " gauth = GoogleAuth()\n", 484 | " gauth.credentials = GoogleCredentials.get_application_default()\n", 485 | " drive = GoogleDrive(gauth)\n", 486 | " uploaded = drive.CreateFile({'title': file_name})\n", 487 | " uploaded.SetContentString(df.to_csv(index=False))\n", 488 | " uploaded.Upload()\n", 489 | " print(file_name)\n", 490 | " print('Full File URL: {}{}'.format(file_url, uploaded.get('id')))" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "9DXLzt8mnQ3v" 499 | }, 500 | "source": [ 501 | "output_to_googledrive(df, output_file_name='full_ltv.csv')\n", 502 | "output_to_googledrive(summary, output_file_name='summary_ltv.csv')\n" 503 | ], 504 | "execution_count": null, 505 | "outputs": [] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": { 510 | "id": "Z57pTY1KSN0t" 511 | }, 512 | "source": [ 513 | "### Sending data back to Google Analytics 360\n", 514 | "\n", 515 | "We have 2 approaches to upload information back to GA.\n", 516 | "\n", 517 | "\n", 518 | "1. **Measurement Protocol Hits**: http call to our collection servers passing the ltv and segmentation per user.\n", 519 | "2. 
**Data Import (Query Time)**: Upload a CSV into Google Analytics 360 using the Management API or the UI.\n", 520 | "\n" 521 | ] 522 | } 523 | ] 524 | } -------------------------------------------------------------------------------- /pills/GA/[DATA_PILL]_[GA]_Customer_Market_Intelligence_(CMI).ipynb: 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA_PILL]_[GA]_Customer_Market_Intelligence_(CMI).ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "zAjuEzhY2P20" 20 | }, 21 | "source": [ 22 | "**PLEASE MAKE A COPY BEFORE CHANGING**\n", 23 | "\n", 24 | "**Copyright** 2021 Google LLC\n", 25 | "\n", 26 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 27 | "you may not use this file except in compliance with the License.\n", 28 | "You may obtain a copy of the License at\n", 29 | "\n", 30 | "    https://www.apache.org/licenses/LICENSE-2.0\n", 31 | "\n", 32 | "Unless required by applicable law or agreed to in writing, software\n", 33 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 34 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 35 | "See the License for the specific language governing permissions and\n", 36 | "limitations under the License.\n", 37 | "\n", 38 | "\n", 39 | "Important\n", 40 | "This content is intended for educational and informational purposes only." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "jb5AbRRQjHz0" 47 | }, 48 | "source": [ 49 | "## Instructions\n", 50 | "\n", 51 | "##### 1. Make a copy of this [Google Sheet](https://docs.google.com/spreadsheets/d/1B8jxst5t4cwYdfoycE28Jg-wYyCGEK4MfmLYDJNzH18/edit)\n", 52 | "##### 2. Add your parameters and click \"Get Google Analytics Data\"\n", 53 | "##### 3. Run this colab." 
54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "id": "O6jaQzqG3XXm" 60 | }, 61 | "source": [ 62 | "##Import Libs and configure Plotly" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "metadata": { 68 | "id": "A-iNHcmf1C_q" 69 | }, 70 | "source": [ 71 | "import IPython\n", 72 | "import plotly\n", 73 | "import plotly.offline as py\n", 74 | "import plotly.graph_objs as go\n", 75 | "import math\n", 76 | "import json\n", 77 | "import numpy as np\n", 78 | "import pandas as pd\n", 79 | "from scipy import spatial\n", 80 | "from scipy.spatial import distance\n", 81 | "from sklearn.cluster import KMeans\n", 82 | "from google.colab import drive\n", 83 | "from google.colab import auth\n", 84 | "from sklearn import preprocessing\n", 85 | "from sklearn.preprocessing import scale\n", 86 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 87 | "from sklearn.preprocessing import MinMaxScaler\n", 88 | "from apiclient.discovery import build\n", 89 | "from oauth2client.service_account import ServiceAccountCredentials\n", 90 | "from IPython.display import display\n", 91 | "\n", 92 | "py.init_notebook_mode(connected=False)\n", 93 | "%matplotlib inline\n", 94 | "py.init_notebook_mode(connected=False)\n" 95 | ], 96 | "execution_count": null, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "id": "oJx-lXZG576Q" 103 | }, 104 | "source": [ 105 | "##Mount Drive and read the Analytics report json" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "id": "IZ_G_2FdcUfW" 112 | }, 113 | "source": [ 114 | "drive.mount('/gdrive')\n", 115 | "with open('/gdrive/My Drive/datapill_cmi_report.json', 'r') as f:\n", 116 | " data = f.read()\n", 117 | "report = json.loads(data)" 118 | ], 119 | "execution_count": null, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "id": "JWinJq253vYm" 126 | }, 127 | "source": [ 128 | "## Define Plot Function" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "metadata": { 134 | "id": "4kCAXdcd0J_v" 135 | }, 136 | "source": [ 137 | "def plot3d(df, item_name_col, value_name_cols):\n", 138 | " #add additional column if only 2 audiences presented\n", 139 | " if len(value_name_cols) == 2:\n", 140 | " df['no_audience'] = 0\n", 141 | " value_name_cols.append('no_audience')\n", 142 | "\n", 143 | " py.init_notebook_mode(connected=False)\n", 144 | "\n", 145 | " trace_points = go.Scatter3d(\n", 146 | " x=df[value_name_cols[0]],\n", 147 | " y=df[value_name_cols[1]],\n", 148 | " z=df[value_name_cols[2]],\n", 149 | " #z=df[value_name_cols[2]] if len(value_name_cols) > 2 else 0,\n", 150 | " text=df[item_name_col],\n", 151 | " mode='markers',\n", 152 | " marker=dict(\n", 153 | " size=12,\n", 154 | " line=dict(\n", 155 | " color='rgb(0, 0, 0, 1)',\n", 156 | " width=0.5\n", 157 | " ),\n", 158 | " color=df.apply(lambda x: \"rgba(\" + str(int(x[value_name_cols[0]]*255)) \n", 159 | " + ',' + str(int(x[value_name_cols[1]]*255)) \n", 160 | " + ',' + str(int(x[value_name_cols[2]]*255)) + ',1)', axis=1),\n", 161 | " opacity=1\n", 162 | " )\n", 163 | " )\n", 164 | " trace_c1 = go.Scatter3d(\n", 165 | " x=[1],\n", 166 | " y=[0],\n", 167 | " z=[0],\n", 168 | " text=value_name_cols[0],\n", 169 | " mode='text+markers',\n", 170 | " marker=dict(\n", 171 | " size=120,\n", 172 | " line=dict(\n", 173 | " color='rgb(255, 0, 0, 0.5)',\n", 174 | " width=3\n", 175 | " ),\n", 176 | " color='rgb(255, 0, 0, 0.5)',#'rgba(217, 217, 217, 0.14)\n", 177 | " opacity=.5,\n", 
178 | " )\n", 179 | " )\n", 180 | " trace_c2 = go.Scatter3d(\n", 181 | " x=[0],\n", 182 | " y=[1],\n", 183 | " z=[0],\n", 184 | " text=value_name_cols[1],\n", 185 | " mode='text+markers',\n", 186 | " marker=dict(\n", 187 | " size=120,\n", 188 | " line=dict(\n", 189 | " color='rgb(0, 255, 0, 0.5)',\n", 190 | " width=3\n", 191 | " ),\n", 192 | " color='rgb(0, 255, 0, 0.5)',#'rgba(217, 217, 217, 0.14)\n", 193 | " opacity=.5,\n", 194 | " )\n", 195 | " )\n", 196 | " trace_c3 = go.Scatter3d(\n", 197 | " x=[0],\n", 198 | " y=[0],\n", 199 | " z=[1],\n", 200 | " text=value_name_cols[2],\n", 201 | " mode='text+markers',\n", 202 | " marker=dict(\n", 203 | " size=120,\n", 204 | " line=dict(\n", 205 | " color='rgb(0, 0, 255, 0.5)',\n", 206 | " width=3\n", 207 | " ),\n", 208 | " color='rgb(0, 0, 255, 0.5)',#'rgba(217, 217, 217, 0.14)\n", 209 | " opacity=.5,\n", 210 | " )\n", 211 | " )\n", 212 | " data = [trace_points, trace_c1,trace_c2,trace_c3]\n", 213 | " layout = go.Layout(\n", 214 | " margin=dict(\n", 215 | " l=0,\n", 216 | " r=0,\n", 217 | " b=0,\n", 218 | " t=0\n", 219 | " )\n", 220 | " )\n", 221 | " fig = go.Figure(data=data, layout=layout)\n", 222 | " #py.iplot(fig, filename='simple-3d-scatter')\n", 223 | "\n", 224 | " py.iplot(data)\n", 225 | " # Plot and embed in ipython notebook!\n", 226 | " #py.iplot(data, filename='basic-scatter')\n", 227 | "\n", 228 | "def configure_plotly_browser_state():\n", 229 | " import IPython\n", 230 | " display(IPython.core.display.HTML('''\n", 231 | " \n", 232 | " \n", 240 | " '''))" 241 | ], 242 | "execution_count": null, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "id": "EdZIUSYW30vA" 249 | }, 250 | "source": [ 251 | "## Define TF-IDF Function" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "id": "iF7nRPS72Lvk" 258 | }, 259 | "source": [ 260 | "def scalarToSigmod(scalar):#0-1 input\n", 261 | " x = (scalar-.5)*8\n", 262 | " return 1 / (1 + math.exp(-x))\n", 263 | "\n", 264 | "def scalarToTanh(scalar):\n", 265 | " x = (scalar-.5)*6\n", 266 | " return (math.tanh(x)+1)/2\n", 267 | "\n", 268 | "def calc_tfidf(df, label_col_name, transformation='tanh'):\n", 269 | " transformer = TfidfTransformer(smooth_idf=True, norm='l1')\n", 270 | "\n", 271 | " X = df.copy()\n", 272 | " y = X[label_col_name]\n", 273 | " X = X.drop([label_col_name], axis=1)\n", 274 | "\n", 275 | " tfidf = transformer.fit_transform(X)\n", 276 | " #create pd with results\n", 277 | " results = pd.DataFrame.from_records(tfidf.toarray() , columns=list(X.columns.values))\n", 278 | " #transpose\n", 279 | " results_transposed = results.T.reset_index()\n", 280 | " results_transposed.columns = [\"COMPARED_USERLIST_FULL_NAME\"] + list(y)\n", 281 | " results_transposed\n", 282 | " #scale to 0-1\n", 283 | " scaler = MinMaxScaler()\n", 284 | " results_transposed[list(y)] = scaler.fit_transform(results_transposed[list(y)])\n", 285 | "\n", 286 | " for col in list(y):\n", 287 | " if transformation == 'sig':\n", 288 | " results_transposed[col] = results_transposed.apply(lambda x: scalarToSigmod(x[col]), axis=1)\n", 289 | " elif transformation == 'tanh':\n", 290 | " results_transposed[col] = results_transposed.apply(lambda x: scalarToTanh(x[col]), axis=1)\n", 291 | " return results_transposed" 292 | ], 293 | "execution_count": null, 294 | "outputs": [] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "id": "a0EVP1y84A7J" 300 | }, 301 | "source": [ 302 | "## Define GA API reporting functions" 303 | ] 304 | }, 305 | 
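{
"cell_type": "markdown",
"metadata": {
"id": "ga_api_shape_note"
},
"source": [
"`process_report` below assumes the Analytics Reporting API v4 response shape. A minimal illustrative fragment (real field names, made-up values) of exactly the fields it reads:\n",
"\n",
"```\n",
"report = {'columnHeader': {'dimensions': ['ga:interestAffinityCategory', 'ga:segment'],\n",
"                           'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users'}]}},\n",
"          'data': {'rows': [{'dimensions': ['Technophiles', 'Audience A'],\n",
"                             'metrics': [{'values': ['1200']}]}]}}\n",
"```"
]
},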
{ 306 | "cell_type": "code", 307 | "metadata": { 308 | "id": "hZaxNk8j3HkT" 309 | }, 310 | "source": [ 311 | "def process_report(report):\n", 312 | " data=[]\n", 313 | " columnHeader = report.get('columnHeader', {})\n", 314 | " dimensionHeaders = columnHeader.get('dimensions', [])\n", 315 | " metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])\n", 316 | " metricHeaders = [header['name'] for header in metricHeaders]\n", 317 | " df_headers = dimensionHeaders + metricHeaders\n", 318 | "\n", 319 | " for row in report['data']['rows']:\n", 320 | " d = row['dimensions']\n", 321 | " m = row['metrics'][0]['values']\n", 322 | " data.append(d+m)\n", 323 | " df = pd.DataFrame(data, columns=df_headers)\n", 324 | " pivot = pd.pivot_table(df, \n", 325 | " index=[df.columns[0]], \n", 326 | " columns=['ga:segment'],\n", 327 | " aggfunc='sum').T\n", 328 | " df = pd.DataFrame(pivot.fillna(0).to_records())\n", 329 | " return df[df.columns[1:]]" 330 | ], 331 | "execution_count": null, 332 | "outputs": [] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "metadata": { 337 | "id": "4CqOmE9o2Eqz" 338 | }, 339 | "source": [ 340 | "df = process_report(report['reports'][0])\n", 341 | "cmi_df = calc_tfidf(df, 'ga:segment')\n", 342 | "cmi_df.head()" 343 | ], 344 | "execution_count": null, 345 | "outputs": [] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "id": "Kdsp44MDap3F" 351 | }, 352 | "source": [ 353 | "configure_plotly_browser_state()\n", 354 | "y = list(cmi_df.drop(['COMPARED_USERLIST_FULL_NAME'],axis=1).columns)\n", 355 | "plot3d(cmi_df,'COMPARED_USERLIST_FULL_NAME',list(y))" 356 | ], 357 | "execution_count": null, 358 | "outputs": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "metadata": { 363 | "id": "KJszS7qnoC_2" 364 | }, 365 | "source": [ 366 | "vecs = [[1,0,0], [0,1,0], [0,0,1]]\n", 367 | "segments = list(cmi_df.columns[1:])\n", 368 | "cmi_df['vector'] = cmi_df[[*segments]].values.tolist()\n", 369 | "for i in range(len(segments)):\n", 370 | " data = []\n", 371 | " col = 'distance_{}'.format(segments[i])\n", 372 | " for row in cmi_df.iterrows():\n", 373 | " euc = distance.euclidean(row[1]['vector'], vecs[i])\n", 374 | " data.append(euc)\n", 375 | " cmi_df[col] = data\n", 376 | "\n", 377 | "\n", 378 | "for col in cmi_df.columns[-3:]:\n", 379 | " display(cmi_df[['COMPARED_USERLIST_FULL_NAME', col]].sort_values(by=col, ascending=True))" 380 | ], 381 | "execution_count": null, 382 | "outputs": [] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "metadata": { 387 | "id": "kBY_p_3CfW5M" 388 | }, 389 | "source": [ 390 | "" 391 | ], 392 | "execution_count": null, 393 | "outputs": [] 394 | } 395 | ] 396 | } -------------------------------------------------------------------------------- /pills/GA/[DATA_PILL]_[GA]_Offline_conversion_upload_(from_Google_Cloud_Storage).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA PILL] [GA] Offline conversion upload (from Google Cloud Storage).ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "toc_visible": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "Copyright 2021 Google LLC\n", 21 | "\n", 22 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 23 | "you may not use this file except in compliance with the 
License.\n", 24 | "You may obtain a copy of the License at\n", 25 | "\n", 26 | "    https://www.apache.org/licenses/LICENSE-2.0\n", 27 | "\n", 28 | "Unless required by applicable law or agreed to in writing, software\n", 29 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 30 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 31 | "See the License for the specific language governing permissions and\n", 32 | "limitations under the License.\n", 33 | "\n", 34 | "\n", 35 | "# Important\n", 36 | "This content is intended for educational and informational purposes only." 37 | ], 38 | "metadata": { 39 | "id": "HVIZc1hFBFT3" 40 | } 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "source": [ 46 | "#@title Install and import dependencies {display-mode: \"form\"}\n", 47 | "%%capture\n", 48 | "!pip install --upgrade google-cloud-storage\n", 49 | "\n", 50 | "from __future__ import division\n", 51 | "from __future__ import print_function\n", 52 | "\n", 53 | "import csv\n", 54 | "import datetime\n", 55 | "import http\n", 56 | "import logging\n", 57 | "import random\n", 58 | "\n", 59 | "from typing import Dict\n", 60 | "\n", 61 | "from google.cloud import storage\n", 62 | "from google.colab import auth" 63 | ], 64 | "outputs": [], 65 | "metadata": { 66 | "id": "7ITLrYqW20xZ" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "source": [ 73 | "#@title Configure required variables\n", 74 | "GCP_PROJECT_ID = \"\" #@param {type:\"string\"}\n", 75 | "GCS_BUCKET_NAME = \"\" #@param {type:\"string\"}\n", 76 | "GCS_SOURCE_PREFIX = \"\" #@param {type:\"string\"}\n", 77 | "GCS_DESTINATION_FOLDER = \"\" #@param {type:\"string\"}\n", 78 | "GA_TRACKING_ID = \"\" #@param {type:\"string\"}" 79 | ], 80 | "outputs": [], 81 | "metadata": { 82 | "id": "9AQOmkIROSLl" 83 | } 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## Map CSV columns to GA Measurement Protocol parameters\n", 89 | "\n", 90 | "### Add all CSV columns with their respective Measurement Protocol parameters in this format:\n", 91 | "\n", 92 | " ```\n", 93 | "variable_mapping = {\n", 94 | "    'measurement_protocol_param_name_1': 'csv_column_title_1',\n", 95 | "    'measurement_protocol_param_name_2': 'csv_column_title_2',\n", 96 | "    .\n", 97 | "    .\n", 98 | "    .\n", 99 | "}\n", 100 | " ```" 101 | ], 102 | "metadata": { 103 | "id": "e6SEwjobAhXg" 104 | } 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "source": [ 110 | "variable_mapping = {\n", 111 | "    'qt': 'order_date',\n", 112 | "    'cid': 'user_id',\n", 113 | "    'ti': 'order_id',\n", 114 | "    'in': 'name',\n", 115 | "    'ip': 'price',\n", 116 | "    'iq': 'quantity'}" 117 | ], 118 | "outputs": [], 119 | "metadata": { 120 | "id": "un9GSJNxOoPV", 121 | "cellView": "both" 122 | } 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "source": [ 128 | "#@title Authenticate to Google Cloud Storage\n", 129 | "auth.authenticate_user()" 130 | ], 131 | "outputs": [], 132 | "metadata": { 133 | "id": "zcGUJsd_QxIV", 134 | "cellView": "form" 135 | } 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "source": [ 141 | "#@title Import conversions\n", 142 | "TEMP_FILE_PATH = 'tmp_blob_file'\n", 143 | "MAX_QUEUE_TIME_OFFSET = 1000 * 60 * 60 * 4  # 4 hours in milliseconds\n", 144 | "\n", 145 | "_http_client = None\n", 146 | "_required_variables = ('qt', 'cid', 'ti', 'in', 'ip', 'iq')\n", 147 | "\n", 148 | "def main():\n", 149 | 
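"  # Flow: validate the CSV-to-Measurement Protocol mapping, stream each CSV\n",
"  # blob from the GCS bucket, send one item hit per line while aggregating\n",
"  # lines into transactions, archive the processed file, then send one\n",
"  # transaction hit per order.\n",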
" validate_mapping()\n", 150 | "\n", 151 | " storage_client = storage.Client(project=GCP_PROJECT_ID)\n", 152 | " bucket = storage_client.get_bucket(GCS_BUCKET_NAME)\n", 153 | " files = bucket.list_blobs(prefix=GCS_SOURCE_PREFIX)\n", 154 | "\n", 155 | " transactions = {}\n", 156 | "\n", 157 | " for file in files:\n", 158 | " for line in read_file(file):\n", 159 | " process_transaction(transactions, line)\n", 160 | " process_item(line)\n", 161 | "\n", 162 | " move_file(bucket, file)\n", 163 | "\n", 164 | " send_transaction_hits(transactions)\n", 165 | "\n", 166 | "\n", 167 | "def validate_mapping():\n", 168 | " for req_var in _required_variables:\n", 169 | " assert(req_var in variable_mapping), \\\n", 170 | " ('The variable %s must be mapped' % req_var)\n", 171 | "\n", 172 | "\n", 173 | "def list_files():\n", 174 | " \n", 175 | " return bucket.list_blobs()\n", 176 | "\n", 177 | "\n", 178 | "def process_transaction(transactions: Dict[str, Dict], line: Dict[str, str]):\n", 179 | "\n", 180 | " transaction_id = line[variable_mapping['ti']]\n", 181 | " user_id = line[variable_mapping['cid']]\n", 182 | "\n", 183 | " if transaction_id not in transactions:\n", 184 | " transactions[transaction_id] = {\n", 185 | " 'revenue': 0.0,\n", 186 | " 'user_id': user_id,\n", 187 | " 'order_date': datetime.datetime.min\n", 188 | " }\n", 189 | "\n", 190 | " current_transaction_date = datetime.datetime.strptime(\n", 191 | " line[variable_mapping['qt']], '%Y-%m-%d %H:%M:%S.%f')\n", 192 | "\n", 193 | " if transactions[transaction_id]['order_date'] < current_transaction_date:\n", 194 | " transactions[transaction_id]['order_date'] = current_transaction_date\n", 195 | "\n", 196 | " transactions[transaction_id]['revenue'] += float(\n", 197 | " line[variable_mapping['ip']])\n", 198 | "\n", 199 | "\n", 200 | "def process_item(line: Dict[str, str]):\n", 201 | " parameters = format_parameters(line)\n", 202 | "\n", 203 | " if parameters:\n", 204 | " payload = 'v=1&t=item&tid=%s&%s' % (GA_TRACKING_ID, parameters)\n", 205 | " send_hit(payload)\n", 206 | "\n", 207 | "\n", 208 | "def read_file(blob: storage.blob.Blob):\n", 209 | " save_temp_blob(blob)\n", 210 | " with open(TEMP_FILE_PATH, 'r') as tmp_file:\n", 211 | " reader = csv.DictReader(tmp_file)\n", 212 | " for row in reader:\n", 213 | " yield row\n", 214 | "\n", 215 | "\n", 216 | "def save_temp_blob(blob: storage.blob.Blob):\n", 217 | " with open(TEMP_FILE_PATH, 'wb') as tmp_file:\n", 218 | " blob.download_to_file(tmp_file)\n", 219 | "\n", 220 | "\n", 221 | "def format_parameters(line: Dict[str, str]):\n", 222 | " query_string = ''\n", 223 | " for map_key in variable_mapping:\n", 224 | " assert(variable_mapping[map_key] in line), \\\n", 225 | " ('Invalid variable mapping. Missing column %s' %\n", 226 | " variable_mapping[map_key])\n", 227 | "\n", 228 | " if map_key == 'qt':\n", 229 | " transaction_date = datetime.datetime.strptime(\n", 230 | " line[variable_mapping[map_key]], '%Y-%m-%d %H:%M:%S.%f')\n", 231 | " queue_time = calculate_queue_time(transaction_date)\n", 232 | " if queue_time >= MAX_QUEUE_TIME_OFFSET:\n", 233 | " logging.warning('Transaction date older than 4 hours. 
Ignoring hit')\n", 234 | " return None\n", 235 | "\n", 236 | " parameter = 'qt=%s&' % queue_time\n", 237 | " else:\n", 238 | " parameter = '&{param}={value}'.format(\n", 239 | " param=map_key,\n", 240 | " value=line[variable_mapping[map_key]])\n", 241 | "\n", 242 | " query_string += parameter\n", 243 | "\n", 244 | " return query_string\n", 245 | "\n", 246 | "\n", 247 | "def calculate_queue_time(date: datetime):\n", 248 | " milliseconds_diff = (date - datetime.datetime.now()).total_seconds() * 1000\n", 249 | " return int(abs(milliseconds_diff))\n", 250 | "\n", 251 | "\n", 252 | "def send_transaction_hits(transactions: Dict[str, Dict]):\n", 253 | " for transaction in transactions:\n", 254 | " queue_time = calculate_queue_time(transactions[transaction]['order_date'])\n", 255 | " if queue_time >= MAX_QUEUE_TIME_OFFSET:\n", 256 | " logging.warning('Transaction date older than 4 hours. Ignoring hit')\n", 257 | " continue\n", 258 | "\n", 259 | " payload = (\n", 260 | " 'v=1&t=transaction&tid=%s&ti=%s&cid=%s&tr=%.2f&sc=end' %\n", 261 | " (GA_TRACKING_ID,\n", 262 | " transaction,\n", 263 | " transactions[transaction]['user_id'],\n", 264 | " transactions[transaction]['revenue'])\n", 265 | " )\n", 266 | "\n", 267 | " send_hit(payload)\n", 268 | "\n", 269 | "\n", 270 | "def send_hit(payload: str):\n", 271 | " global _http_client\n", 272 | "\n", 273 | " if not _http_client:\n", 274 | " _http_client = http.client.HTTPSConnection('www.google-analytics.com')\n", 275 | "\n", 276 | " payload += '&z=%s' % str(random.randrange(100000000000, 999999999999))\n", 277 | "\n", 278 | " print(payload)\n", 279 | "\n", 280 | " _http_client.request('POST', '/collect', body=payload, headers={\n", 281 | " 'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML'\n", 282 | " ', like Gecko) Chrome/76.0.3809.132 Safari/537.36')\n", 283 | " })\n", 284 | " response = _http_client.getresponse()\n", 285 | " response_content = response.read()\n", 286 | "\n", 287 | " if response.code != 200:\n", 288 | " logging.error('An error has occurred during hit post')\n", 289 | " else:\n", 290 | " logging.debug('Hit sent successfully')\n", 291 | "\n", 292 | "\n", 293 | "def move_file(bucket: storage.bucket.Bucket, file: storage.blob.Blob):\n", 294 | " new_file_name = '%s/%s' % (GCS_DESTINATION_FOLDER, file.name)\n", 295 | " new_file = bucket.rename_blob(file, new_file_name)\n", 296 | " logging.debug('File {} moved to {}'.format(file.name, new_file.name))\n", 297 | "\n", 298 | "\n", 299 | "main()" 300 | ], 301 | "outputs": [ 302 | { 303 | "output_type": "stream", 304 | "name": "stdout", 305 | "text": [ 306 | "v=1&t=item&tid=UA-138985750-1&qt=9517399&&cid=f681cef5-7b4c-469e-a178-0b1442c17b5a&ti=128940&in=Black Vans T-Shirt&ip=9.90&iq=1&z=597129382998\n", 307 | "v=1&t=item&tid=UA-138985750-1&qt=9517419&&cid=f681cef5-7b4c-469e-a178-0b1442c17b5a&ti=128940&in=Red Nike Shoes&ip=39.90&iq=1&z=812883007270\n", 308 | "v=1&t=item&tid=UA-138985750-1&qt=9517421&&cid=f681cef5-7b4c-469e-a178-0b1442c17b5a&ti=128940&in=Jeans Calvin Klein Pants&ip=59.90&iq=1&z=232147591534\n", 309 | "v=1&t=transaction&tid=UA-138985750-1&ti=128940&cid=f681cef5-7b4c-469e-a178-0b1442c17b5a&tr=109.70&sc=end&z=384087395714\n" 310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "id": "n7aOgKub6R-T", 315 | "cellView": "form", 316 | "colab": { 317 | "base_uri": "https://localhost:8080/", 318 | "height": 85 319 | }, 320 | "outputId": "f9f4b3b0-8892-4809-c1c0-32b2cf49034b" 321 | } 322 | } 323 | ] 324 | } 
-------------------------------------------------------------------------------- /pills/GA/[DATA_PILL]_[GA]_Offline_conversion_upload_(from_Google_Drive).ipynb: 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "colab": { 6 | "name": "[DATA PILL] [GA] Offline conversion upload (from Google Drive).ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "toc_visible": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "Copyright 2021 Google LLC\n", 21 | "\n", 22 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 23 | "you may not use this file except in compliance with the License.\n", 24 | "You may obtain a copy of the License at\n", 25 | "\n", 26 | "    https://www.apache.org/licenses/LICENSE-2.0\n", 27 | "\n", 28 | "Unless required by applicable law or agreed to in writing, software\n", 29 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 30 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 31 | "See the License for the specific language governing permissions and\n", 32 | "limitations under the License.\n", 33 | "\n", 34 | "\n", 35 | "# Important\n", 36 | "This content is intended for educational and informational purposes only." 37 | ], 38 | "metadata": { 39 | "id": "HVIZc1hFBFT3" 40 | } 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "source": [ 46 | "#@title Install and import dependencies {display-mode: \"form\"}\n", 47 | "from __future__ import division\n", 48 | "from __future__ import print_function\n", 49 | "\n", 50 | "import csv\n", 51 | "import datetime\n", 52 | "import http\n", 53 | "import logging\n", 54 | "import random\n", 55 | "\n", 56 | "from typing import Dict\n", 57 | "\n", 58 | "from google.colab import auth\n", 59 | "from google.colab import drive" 60 | ], 61 | "outputs": [], 62 | "metadata": { 63 | "id": "7ITLrYqW20xZ" 64 | } 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "source": [ 70 | "#@title Configure required variables\n", 71 | "CSV_FILENAME = \"\" #@param {type:\"string\"}\n", 72 | "GA_TRACKING_ID = \"\" #@param {type:\"string\"}" 73 | ], 74 | "outputs": [], 75 | "metadata": { 76 | "id": "9AQOmkIROSLl" 77 | } 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "source": [ 82 | "## Map CSV columns to GA Measurement Protocol parameters\n", 83 | "\n", 84 | "### Add all CSV columns with their respective Measurement Protocol parameters in this format:\n", 85 | "\n", 86 | " ```\n", 87 | "variable_mapping = {\n", 88 | "    'measurement_protocol_param_name_1': 'csv_column_title_1',\n", 89 | "    'measurement_protocol_param_name_2': 'csv_column_title_2',\n", 90 | "    .\n", 91 | "    .\n", 92 | "    .\n", 93 | "}\n", 94 | " ```" 95 | ], 96 | "metadata": { 97 | "id": "e6SEwjobAhXg" 98 | } 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "source": [ 104 | "variable_mapping = {\n", 105 | "    'qt': 'order_date',\n", 106 | "    'cid': 'user_id',\n", 107 | "    'ti': 'order_id',\n", 108 | "    'in': 'name',\n", 109 | "    'ip': 'price',\n", 110 | "    'iq': 'quantity'}" 111 | ], 112 | "outputs": [], 113 | "metadata": { 114 | "id": "un9GSJNxOoPV", 115 | "cellView": "both" 116 | } 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "source": [ 122 | "#@title Authenticate the 
user\n", 123 | "auth.authenticate_user()" 124 | ], 125 | "outputs": [], 126 | "metadata": { 127 | "id": "zcGUJsd_QxIV", 128 | "cellView": "form" 129 | } 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "source": [ 135 | "#@title Import conversions\n", 136 | "MAX_QUEUE_TIME_OFFSET = 1000 * 60 * 60 * 4  # 4 hours in milliseconds\n", 137 | "\n", 138 | "_http_client = None\n", 139 | "_required_variables = ('qt', 'cid', 'ti', 'in', 'ip', 'iq')\n", 140 | "\n", 141 | "def main():\n", 142 | "  validate_mapping()\n", 143 | "  drive.mount('/content/drive')\n", 144 | "\n", 145 | "  transactions = {}\n", 146 | "\n", 147 | "  with open('/content/drive/My Drive/' + CSV_FILENAME, 'r') as csv_file:\n", 148 | "    reader = csv.DictReader(csv_file)\n", 149 | "    for line in reader:\n", 150 | "      process_transaction(transactions, line)\n", 151 | "      process_item(line)\n", 152 | "  \n", 153 | "  print(transactions)\n", 154 | "  send_transaction_hits(transactions)\n", 155 | "\n", 156 | "\n", 157 | "def validate_mapping():\n", 158 | "  for req_var in _required_variables:\n", 159 | "    assert(req_var in variable_mapping), \\\n", 160 | "        ('The variable %s must be mapped' % req_var)\n", 161 | "\n", 162 | "\n", 167 | "def process_transaction(transactions: Dict[str, Dict], line: Dict[str, str]):\n", 168 | "\n", 169 | "  transaction_id = line[variable_mapping['ti']]\n", 170 | "  user_id = line[variable_mapping['cid']]\n", 171 | "\n", 172 | "  if transaction_id not in transactions:\n", 173 | "    transactions[transaction_id] = {\n", 174 | "        'revenue': 0.0,\n", 175 | "        'user_id': user_id,\n", 176 | "        'order_date': datetime.datetime.min\n", 177 | "    }\n", 178 | "\n", 179 | "  current_transaction_date = datetime.datetime.strptime(\n", 180 | "      line[variable_mapping['qt']], '%Y-%m-%d %H:%M:%S.%f')\n", 181 | "\n", 182 | "  if transactions[transaction_id]['order_date'] < current_transaction_date:\n", 183 | "    transactions[transaction_id]['order_date'] = current_transaction_date\n", 184 | "\n", 185 | "  transactions[transaction_id]['revenue'] += float(\n", 186 | "      line[variable_mapping['ip']])\n", 187 | "\n", 188 | "\n", 189 | "def process_item(line: Dict[str, str]):\n", 190 | "  parameters = format_parameters(line)\n", 191 | "\n", 192 | "  if parameters:\n", 193 | "    payload = 'v=1&t=item&tid=%s&%s' % (GA_TRACKING_ID, parameters)\n", 194 | "    send_hit(payload)\n", 195 | "\n", 196 | "\n", 197 | "def format_parameters(line: Dict[str, str]):\n", 198 | "  query_string = ''\n", 199 | "  for map_key in variable_mapping:\n", 200 | "    assert(variable_mapping[map_key] in line), \\\n", 201 | "        ('Invalid variable mapping. Missing column %s' %\n", 202 | "         variable_mapping[map_key])\n", 203 | "\n", 204 | "    if map_key == 'qt':\n", 205 | "      transaction_date = datetime.datetime.strptime(\n", 206 | "          line[variable_mapping[map_key]], '%Y-%m-%d %H:%M:%S.%f')\n", 207 | "      queue_time = calculate_queue_time(transaction_date)\n", 208 | "      if queue_time >= MAX_QUEUE_TIME_OFFSET:\n", 209 | "        logging.warning('Transaction date older than 4 hours. 
Ignoring hit')\n", 210 | "        return None\n", 211 | "\n", 212 | "      parameter = 'qt=%s&' % queue_time\n", 213 | "    else:\n", 214 | "      parameter = '&{param}={value}'.format(\n", 215 | "          param=map_key,\n", 216 | "          value=line[variable_mapping[map_key]])\n", 217 | "\n", 218 | "    query_string += parameter\n", 219 | "\n", 220 | "  return query_string\n", 221 | "\n", 222 | "\n", 223 | "def calculate_queue_time(date: datetime):\n", 224 | "  milliseconds_diff = (date - datetime.datetime.now()).total_seconds() * 1000\n", 225 | "  return int(abs(milliseconds_diff))\n", 226 | "\n", 227 | "\n", 228 | "def send_transaction_hits(transactions: Dict[str, Dict]):\n", 229 | "  for transaction in transactions:\n", 230 | "    queue_time = calculate_queue_time(transactions[transaction]['order_date'])\n", 231 | "    if queue_time >= MAX_QUEUE_TIME_OFFSET:\n", 232 | "      logging.warning('Transaction date older than 4 hours. Ignoring hit')\n", 233 | "      continue\n", 234 | "\n", 235 | "    payload = (\n", 236 | "        'v=1&t=transaction&tid=%s&ti=%s&cid=%s&tr=%.2f&sc=end' %\n", 237 | "        (GA_TRACKING_ID,\n", 238 | "         transaction,\n", 239 | "         transactions[transaction]['user_id'],\n", 240 | "         transactions[transaction]['revenue'])\n", 241 | "    )\n", 242 | "\n", 243 | "    send_hit(payload)\n", 244 | "\n", 245 | "\n", 246 | "def send_hit(payload: str):\n", 247 | "  global _http_client\n", 248 | "\n", 249 | "  if not _http_client:\n", 250 | "    _http_client = http.client.HTTPSConnection('www.google-analytics.com')\n", 251 | "\n", 252 | "  payload += '&z=%s' % str(random.randrange(100000000000, 999999999999))\n", 253 | "\n", 254 | "  print(payload)\n", 255 | "\n", 256 | "  # _http_client.request('POST', '/collect', body=payload, headers={\n", 257 | "  #     'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML'\n", 258 | "  #                    ', like Gecko) Chrome/76.0.3809.132 Safari/537.36')\n", 259 | "  # })\n", 260 | "  # response = _http_client.getresponse()\n", 261 | "  # response_content = response.read()\n", 262 | "\n", 263 | "  # This cell is a dry run: the request above is commented out, so payloads are\n", 264 | "  # only printed. Uncomment the request lines and the check below to send hits.\n", 265 | "  # if response.code != 200:\n", 266 | "  #   logging.error('An error has occurred during hit post')\n", 267 | "  # else:\n", "  #   logging.debug('Hit sent successfully')\n", 268 | "\n", 269 | "\n", 270 | "main()" 271 | ], 272 | "outputs": [], 273 | "metadata": { 274 | "id": "n7aOgKub6R-T", 275 | "cellView": "code" 276 | } 277 | } 278 | ] 279 | } -------------------------------------------------------------------------------- /pills/Google Ads/[DATA_PILL]_[Google_Ads]_Customer_Market_Intelligence_(CMI).ipynb: 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "[CM] Customer Market Intelligence (CMI)", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "zAjuEzhY2P20" 20 | }, 21 | "source": [ 22 | "**PLEASE MAKE A COPY BEFORE CHANGING**\n", 23 | "\n", 24 | "**Copyright** 2018 Google LLC\n", 25 | "\n", 26 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 27 | "you may not use this file except in compliance with the License.\n", 28 | "You may obtain a copy of the License at\n", 29 | "\n", 30 | "    https://www.apache.org/licenses/LICENSE-2.0\n", 31 | "\n", 32 | "Unless required by applicable law or agreed to in writing, software\n", 33 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 34 | 
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 35 | "See the License for the specific language governing permissions and\n", 36 | "limitations under the License.\n", 37 | "\n", 38 | "\n", 39 | "Important\n", 40 | "This content are intended for educational and informational purposes only." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "jb5AbRRQjHz0" 47 | }, 48 | "source": [ 49 | "## Instructions\n", 50 | "\n", 51 | "##### 1. Upload your CRM audience lists to Google Ads through Customer Match \n", 52 | "##### 2. In Google Ads Audience Manager, click the Audiences and then go into \"Audience Insights\"\n", 53 | "##### 3. Download the reports from Audience Insights and place them in a locally acessible folder\n", 54 | "##### 4. Configure the locations below then run this colab." 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "c4743yoOQfzV" 61 | }, 62 | "source": [ 63 | "# Configuration" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "lhMMfBncR7nY" 70 | }, 71 | "source": [ 72 | "**Form fields**\n", 73 | "\n", 74 | "\n", 75 | "* **audienceX_name**: name of the audience segment\n", 76 | "* **audienceX_file_location**: location of the CSV file containing Audience Insights from Customer Match\n", 77 | "* **audienceX_size**: size of the audience as shown in the List Size fields inside Customer Match\n", 78 | "* **isUsingGDrive**: check this box if the location of the CSV files are inside a Google Drive. Make sure to use the \"/gdrive/\" path for file locations.\n", 79 | "\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "etaITd9zQD_m", 86 | "cellView": "form" 87 | }, 88 | "source": [ 89 | "audience1_name = \"\" #@param {type:\"string\"}\n", 90 | "audience1_file_location = \"\" #@param {type:\"string\"}\n", 91 | "audience1_size = 0#@param {type:\"integer\"}\n", 92 | "audience2_name = \"\" #@param {type:\"string\"}\n", 93 | "audience2_file_location = \"\" #@param {type:\"string\"}\n", 94 | "audience2_size = 0 #@param {type:\"integer\"}\n", 95 | "audience3_name = \"\" #@param {type:\"string\"}\n", 96 | "audience3_file_location = \"\" #@param {type:\"string\"}\n", 97 | "audience3_size = 0#@param {type:\"integer\"}\n", 98 | "isUsingGDrive = False #@param {type:\"boolean\"}\n", 99 | "\n" 100 | ], 101 | "execution_count": null, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "id": "O6jaQzqG3XXm" 108 | }, 109 | "source": [ 110 | "##Import Libs and configure Plotly" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "A-iNHcmf1C_q" 117 | }, 118 | "source": [ 119 | "import IPython\n", 120 | "import plotly\n", 121 | "import plotly.offline as py\n", 122 | "import plotly.graph_objs as go\n", 123 | "import math\n", 124 | "import json\n", 125 | "import numpy as np\n", 126 | "import pandas as pd\n", 127 | "import re\n", 128 | "from scipy import spatial\n", 129 | "from scipy.spatial import distance\n", 130 | "from sklearn.cluster import KMeans\n", 131 | "from google.colab import drive\n", 132 | "from google.colab import auth\n", 133 | "from sklearn import preprocessing\n", 134 | "from sklearn.preprocessing import scale\n", 135 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 136 | "from sklearn.preprocessing import MinMaxScaler\n", 137 | "from apiclient.discovery import build\n", 138 | "from oauth2client.service_account import 
139 | "from IPython.display import display\n", 140 | "import matplotlib as mpl\n", 141 | "\n", 142 | "py.init_notebook_mode(connected=False)\n", 143 | "%matplotlib inline\n", 144 | "" 145 | ], 146 | "execution_count": null, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "id": "oJx-lXZG576Q" 153 | }, 154 | "source": [ 155 | "## Mount Drive and read the Customer Match Insights CSVs" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "IZ_G_2FdcUfW" 162 | }, 163 | "source": [ 164 | "if isUsingGDrive:\n", 165 | "  drive.mount('/gdrive')\n", 166 | "df_1 = pd.read_csv(audience1_file_location, usecols=['Dimension', 'Audience', 'List distribution'])\n", 167 | "df_1['List distribution'] = round(df_1['List distribution'] * audience1_size)\n", 168 | "df_2 = pd.read_csv(audience2_file_location, usecols=['Dimension', 'Audience', 'List distribution'])\n", 169 | "df_2['List distribution'] = round(df_2['List distribution'] * audience2_size)\n", 170 | "if (audience3_name != \"\") and (audience3_file_location != \"\") and (audience3_size > 0):\n", 171 | "  audience3_enabled = True\n", 172 | "  df_3 = pd.read_csv(audience3_file_location, usecols=['Dimension', 'Audience', 'List distribution'])\n", 173 | "  df_3['List distribution'] = round(df_3['List distribution'] * audience3_size)\n", 174 | "else:\n", 175 | "  audience3_enabled = False" 176 | ], 177 | "execution_count": null, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "id": "JWinJq253vYm" 184 | }, 185 | "source": [ 186 | "## Define Plot Function" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "metadata": { 192 | "id": "4kCAXdcd0J_v" 193 | }, 194 | "source": [ 195 | "def plot3d(df, item_name_col, value_name_cols):\n", 196 | "  # add a zero-filled third axis when only two audiences are provided\n", 197 | "  if len(value_name_cols) == 2:\n", 198 | "    df['no_audience'] = 0\n", 199 | "    value_name_cols.append('no_audience')\n", 200 | "\n", 201 | "  py.init_notebook_mode(connected=False)\n", 202 | "\n", 203 | "  trace_points = go.Scatter3d(\n", 204 | "      x=df[value_name_cols[0]],\n", 205 | "      y=df[value_name_cols[1]],\n", 206 | "      z=df[value_name_cols[2]],\n", 207 | "      #z=df[value_name_cols[2]] if len(value_name_cols) > 2 else 0,\n", 208 | "      text=df[item_name_col],\n", 209 | "      mode='markers',\n", 210 | "      marker=dict(\n", 211 | "          size=12,\n", 212 | "          line=dict(\n", 213 | "              color='rgba(0, 0, 0, 1)',\n", 214 | "              width=0.5\n", 215 | "          ),\n", 216 | "          color=df.apply(lambda x: \"rgba(\" + str(int(x[value_name_cols[0]]*255))\n", 217 | "                         + ',' + str(int(x[value_name_cols[1]]*255))\n", 218 | "                         + ',' + str(int(x[value_name_cols[2]]*255)) + ',1)', axis=1),\n", 219 | "          opacity=1\n", 220 | "      )\n", 221 | "  )\n", 222 | "  trace_c1 = go.Scatter3d(\n", 223 | "      x=[1],\n", 224 | "      y=[0],\n", 225 | "      z=[0],\n", 226 | "      text=value_name_cols[0],\n", 227 | "      mode='text+markers',\n", 228 | "      marker=dict(\n", 229 | "          size=120,\n", 230 | "          line=dict(\n", 231 | "              color='rgba(255, 0, 0, 0.5)',\n", 232 | "              width=3\n", 233 | "          ),\n", 234 | "          color='rgba(255, 0, 0, 0.5)',\n", 235 | "          opacity=.5,\n", 236 | "      )\n", 237 | "  )\n", 238 | "  trace_c2 = go.Scatter3d(\n", 239 | "      x=[0],\n", 240 | "      y=[1],\n", 241 | "      z=[0],\n", 242 | "      text=value_name_cols[1],\n", 243 | "      mode='text+markers',\n", 244 | "      marker=dict(\n", 245 | "          size=120,\n", 246 | "          line=dict(\n", 247 | "              color='rgba(0, 255, 0, 0.5)',\n", 248 | "              width=3\n",
249 | "          ),\n", 250 | "          color='rgba(0, 255, 0, 0.5)',\n", 251 | "          opacity=.5,\n", 252 | "      )\n", 253 | "  )\n", 254 | "  trace_c3 = go.Scatter3d(\n", 255 | "      x=[0],\n", 256 | "      y=[0],\n", 257 | "      z=[1],\n", 258 | "      text=value_name_cols[2],\n", 259 | "      mode='text+markers',\n", 260 | "      marker=dict(\n", 261 | "          size=120,\n", 262 | "          line=dict(\n", 263 | "              color='rgba(0, 0, 255, 0.5)',\n", 264 | "              width=3\n", 265 | "          ),\n", 266 | "          color='rgba(0, 0, 255, 0.5)',\n", 267 | "          opacity=.5,\n", 268 | "      )\n", 269 | "  )\n", 270 | "  data = [trace_points, trace_c1, trace_c2, trace_c3]\n", 271 | "  layout = go.Layout(\n", 272 | "      margin=dict(\n", 273 | "          l=0,\n", 274 | "          r=0,\n", 275 | "          b=0,\n", 276 | "          t=0\n", 277 | "      )\n", 278 | "  )\n", 279 | "  fig = go.Figure(data=data, layout=layout)\n", 280 | "  #py.iplot(fig, filename='simple-3d-scatter')\n", 281 | "\n", 282 | "  py.iplot(fig)  # plot the traces with the zero-margin layout defined above\n", 283 | "  # Plot and embed in ipython notebook!\n", 284 | "  #py.iplot(data, filename='basic-scatter')\n", 285 | "\n", 286 | "def configure_plotly_browser_state():\n", 287 | "  import IPython  # standard Colab helper: load Plotly from its CDN via requirejs\n", 288 | "  display(IPython.core.display.HTML('''\n", 289 | "        <script src=\"/static/components/requirejs/require.js\"></script>\n", 290 | "        <script>\n", 291 | "          requirejs.config({\n", 292 | "            paths: {\n", 293 | "              base: '/static/base',\n", 294 | "              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',\n", 295 | "            },\n", 296 | "          });\n", 297 | "        </script>\n", 298 | "        '''))" 299 | ], 300 | "execution_count": null, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "EdZIUSYW30vA" 307 | }, 308 | "source": [ 309 | "## Define TF-IDF Function" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "metadata": { 315 | "id": "iF7nRPS72Lvk" 316 | }, 317 | "source": [ 318 | "def scalarToSigmoid(scalar):  # expects a 0-1 input\n", 319 | "  x = (scalar - .5) * 8\n", 320 | "  return 1 / (1 + math.exp(-x))\n", 321 | "\n", 322 | "def scalarToTanh(scalar):\n", 323 | "  x = (scalar - .5) * 6\n", 324 | "  return (math.tanh(x) + 1) / 2\n", 325 | "\n", 326 | "def calc_tfidf(df, label_col_name, transformation='tanh'):\n", 327 | "  transformer = TfidfTransformer(smooth_idf=True, norm='l1', use_idf=False)\n", 328 | "\n", 329 | "  X = df.copy()\n", 330 | "  y = X[label_col_name]\n", 331 | "  X = X.drop([label_col_name], axis=1)\n", 332 | "\n", 333 | "  tfidf = transformer.fit_transform(X)\n", 334 | "  # collect the results into a DataFrame\n", 335 | "  results = pd.DataFrame.from_records(tfidf.toarray(), columns=list(X.columns.values))\n", 336 | "  # transpose so each row is an audience attribute\n", 337 | "  results_transposed = results.T.reset_index()\n", 338 | "  results_transposed.columns = [\"COMPARED_USERLIST_FULL_NAME\"] + list(y)\n", 339 | "\n", 340 | "  # scale to 0-1\n", 341 | "  scaler = MinMaxScaler()\n", 342 | "  results_transposed[list(y)] = scaler.fit_transform(results_transposed[list(y)])\n", 343 | "\n", 344 | "  for col in list(y):\n", 345 | "    if transformation == 'sig':\n", 346 | "      results_transposed[col] = results_transposed.apply(lambda x: scalarToSigmoid(x[col]), axis=1)\n", 347 | "    elif transformation == 'tanh':\n", 348 | "      results_transposed[col] = results_transposed.apply(lambda x: scalarToTanh(x[col]), axis=1)\n", 349 | "  return results_transposed" 350 | ], 351 | "execution_count": null, 352 | "outputs": [] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": { 357 | "id": "a0EVP1y84A7J" 358 | }, 359 | "source": [ 360 | "## Define GA API reporting functions" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "metadata": { 366 | "id": "hZaxNk8j3HkT" 367 | }, 368 | "source": [ 369 | "def process_report(report):\n", 370 | "  data = []\n", 371 | "  columnHeader = report.get('columnHeader', {})\n", 372 | "  dimensionHeaders = columnHeader.get('dimensions', [])\n", 373 | "  metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])\n",
374 | "  metricHeaders = [header['name'] for header in metricHeaders]\n", 375 | "  df_headers = dimensionHeaders + metricHeaders\n", 376 | "\n", 377 | "  for row in report['data']['rows']:\n", 378 | "    d = row['dimensions']\n", 379 | "    m = row['metrics'][0]['values']\n", 380 | "    data.append(d + m)\n", 381 | "  df = pd.DataFrame(data, columns=df_headers)\n", 382 | "  pivot = pd.pivot_table(df,\n", 383 | "                         index=[df.columns[0]],\n", 384 | "                         columns=['ga:segment'],\n", 385 | "                         aggfunc='sum').T\n", 386 | "  df = pd.DataFrame(pivot.fillna(0).to_records())\n", 387 | "  return df[df.columns[1:]]" 388 | ], 389 | "execution_count": null, 390 | "outputs": [] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": { 395 | "id": "El_6e2hnjn8K" 396 | }, 397 | "source": [ 398 | "## Run TF-IDF" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "metadata": { 404 | "id": "goZq6ox8muXA" 405 | }, 406 | "source": [ 407 | "df_1['Segmento'] = audience1_name\n", 408 | "df_2['Segmento'] = audience2_name\n", 409 | "if audience3_enabled:\n", 410 | "  df_3['Segmento'] = audience3_name\n", 411 | "  df_list = [df_1, df_2, df_3]\n", 412 | "else:\n", 413 | "  df_list = [df_1, df_2]\n", 414 | "df = pd.concat(df_list)\n", 415 | "df = df.loc[df['Dimension'] != 'City']\n", 416 | "df = df.loc[df['Dimension'] != 'Country']\n", 417 | "df['Audience'] = df['Dimension'] + ' | ' + df['Audience']\n", 418 | "df.drop(['Dimension'], axis=1, inplace=True)\n", 419 | "df_pivot = pd.pivot_table(df, index=['Segmento'], columns=['Audience'], aggfunc='sum').fillna(0)\n", 420 | "df_pivot.columns = df_pivot.columns.droplevel(level=0)\n", 421 | "df_pivot.reset_index(level=[0], inplace=True)\n", 422 | "cmi_df = calc_tfidf(df_pivot, 'Segmento')\n", 423 | "cmi_df.head()" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": { 431 | "id": "Sj_3vDc0khHC" 432 | }, 433 | "source": [ 434 | "## Plot the results" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "metadata": { 440 | "id": "Kdsp44MDap3F" 441 | }, 442 | "source": [ 443 | "def plot_3d(cmi_df):\n", 444 | "  configure_plotly_browser_state()\n", 445 | "  y = list(cmi_df.drop(['COMPARED_USERLIST_FULL_NAME'], axis=1).columns)\n", 446 | "  plot3d(cmi_df, 'COMPARED_USERLIST_FULL_NAME', list(y))\n", 447 | "\n", 448 | "def print_ordered_list(cmi_df):\n", 449 | "  vecs = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]\n", 450 | "  segments = list(cmi_df.columns[1:])\n", 451 | "  cmi_df['vector'] = cmi_df[[*segments]].values.tolist()\n", 452 | "  for i in range(len(segments)):\n", 453 | "    data = []\n", 454 | "    col = 'distance_{}'.format(segments[i])\n", 455 | "    for _, row in cmi_df.iterrows():\n", 456 | "      euc = distance.euclidean(row['vector'], vecs[i])\n", 457 | "      data.append(euc)\n", 458 | "    cmi_df[col] = data\n", 459 | "\n", 460 | "  # the distance columns appended above are the last len(segments) columns\n", 461 | "  for col in cmi_df.columns[-len(segments):]:\n", 462 | "    display(cmi_df[['COMPARED_USERLIST_FULL_NAME', col]].sort_values(by=col, ascending=True))\n", 463 | "\n", 464 | "plot_3d(cmi_df)\n", 465 | "print_ordered_list(cmi_df)" 466 | ], 467 | "execution_count": null, 468 | "outputs": [] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "metadata": { 473 | "id": "homk-KpOJ_aU" 474 | }, 475 | "source": [ 476 | "" 477 | ], 478 | "execution_count": null, 479 | "outputs": [] 480 | } 481 | ] 482 | } -------------------------------------------------------------------------------- /pills/Third Party/[DATA_PILL]_[Appsflyer]_Install_Report_Analysis.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**PLEASE MAKE A COPY BEFORE CHANGING**\n", 8 | "\n", 9 | "**Copyright** 2022 Google LLC\n", 10 | "\n", 11 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 12 | "you may not use this file except in compliance with the License.\n", 13 | "You may obtain a copy of the License at\n", 14 | "\n", 15 | "    https://www.apache.org/licenses/LICENSE-2.0\n", 16 | "\n", 17 | "Unless required by applicable law or agreed to in writing, software\n", 18 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 19 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 20 | "See the License for the specific language governing permissions and\n", 21 | "limitations under the License.\n", 22 | "\n", 23 | "\n", 24 | "Important\n", 25 | "This content is intended for educational and informational purposes only." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Instructions\n", 33 | "\n", 34 | "##### 1. Export the Install Report from Appsflyer\n", 35 | "##### 2. Upload the CSV to Google Drive\n", 36 | "##### 3. Configure the locations below, then run this Colab." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Import necessary packages" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "id": "-UI6voLWzyk3", 51 | "vscode": { 52 | "languageId": "python" 53 | } 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "## Import Packages\n", 58 | "import pandas as pd\n", 59 | "import numpy as np\n", 60 | "import matplotlib.pyplot as plt\n", 61 | "import seaborn as sns" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "id": "ABEgtUqQN8j0" 68 | }, 69 | "source": [ 70 | "# Mount Google Drive" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "id": "Y1pSQ-boEoVV", 81 | "outputId": "804ca21d-1100-4981-c3d5-6972b229225a", 82 | "vscode": { 83 | "languageId": "python" 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "## Mount to Google Drive\n", 89 | "from google.colab import drive\n", 90 | "drive.mount('/content/drive')\n", 91 | "print(\"Log: Google Drive mounted on 'Files' tab\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "id": "aItyjyOaNiLk" 98 | }, 99 | "source": [ 100 | "# Import Appsflyer's *Install Report* as CSV from Google Drive" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "colab": { 108 | "base_uri": "https://localhost:8080/" 109 | }, 110 | "id": "NBbCBKDzEvMR", 111 | "outputId": "88c5aeac-96e4-43b1-8753-46d17f59c755", 112 | "vscode": { 113 | "languageId": "python" 114 | } 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "file_path = \"/content/drive/location/file.csv\" # @param {type:\"string\"}\n", 119 | "# low_memory=False reads the whole file at once, avoiding mixed-dtype warnings\n", 120 | "df = pd.read_csv(file_path, low_memory=False)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "Yjp69IBzOAY1" 127 | }, 128 | "source": [ 129 | "# Prepare and check dataframe" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {
| "colab": { 137 | "base_uri": "https://localhost:8080/", 138 | "height": 478 139 | }, 140 | "id": "DIJ4Eyi9bEYK", 141 | "outputId": "ec9f123a-d438-4f5c-e501-88a3b01691d1", 142 | "vscode": { 143 | "languageId": "python" 144 | } 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# @title Select necessary columns and prepare dataframe { vertical-output: true, display-mode: \"form\" }\n", 149 | "## Select necessary columns\n", 150 | "df = df[df['Event Name']=='install']\n", 151 | "df = df[['Attributed Touch Type'\n", 152 | " , 'Attributed Touch Time'\n", 153 | " , 'Install Time'\n", 154 | " , 'Media Source'\n", 155 | " , 'Country Code'\n", 156 | " , 'Contributor 1 Touch Type' \n", 157 | " , 'Contributor 1 Touch Time'\n", 158 | " , 'Contributor 1 Media Source'\n", 159 | " , 'Contributor 2 Touch Type' \n", 160 | " , 'Contributor 2 Touch Time'\n", 161 | " , 'Contributor 2 Media Source'\n", 162 | " , 'Contributor 3 Touch Type' \n", 163 | " , 'Contributor 3 Touch Time'\n", 164 | " , 'Contributor 3 Media Source'\n", 165 | " ]]\n", 166 | "## Calculate time Touch to install time\n", 167 | "df['Install-Touch Timestamp'] = (pd.to_datetime(df['Install Time']) -\\\n", 168 | " pd.to_datetime(df['Attributed Touch Time']))\n", 169 | "\n", 170 | "df['Install-Touch sec'] = pd.to_timedelta(df['Install-Touch Timestamp'], unit='s')\n", 171 | "\n", 172 | "df['Install-Touch sec'] = df['Install-Touch sec'].dt.total_seconds()\n", 173 | "df.rename(columns={'Media Source': 'Attributed Media Source'}, inplace=True)\n", 174 | "df.head(3)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "colab": { 182 | "base_uri": "https://localhost:8080/", 183 | "height": 488 184 | }, 185 | "id": "no8CNVvFdkLI", 186 | "outputId": "94b62f1e-5571-4134-a709-6c39ea044af1", 187 | "vscode": { 188 | "languageId": "python" 189 | } 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "# @title Describe data { vertical-output: true, display-mode: \"form\" }\n", 194 | "\n", 195 | "grouping = \"Attributed Media Source\" #@param [\"Attributed Media Source\", \"Contributor 1 Media Source\", \"Contributor 2 Media Source\", \"Contributor 3 Media Source\"]\n", 196 | "\n", 197 | "df_cont = df.groupby(grouping).agg(['count', 'mean','min','max','std'])\n", 198 | "column = 'Install-Touch sec' # @param['Install-Touch sec']\n", 199 | "min_entries = 500 # @param {type:\"number\"}\n", 200 | "\n", 201 | "df_cont=df_cont[column].sort_values(by=['count'], ascending=False)\n", 202 | "df_cont=df_cont[df_cont['count']>=min_entries]\n", 203 | "\n", 204 | "##Affects next card\n", 205 | "medias = list(df_cont.index.values) \n", 206 | "\n", 207 | "df_cont" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "LaQSPiK7OS3w" 214 | }, 215 | "source": [ 216 | "# Plots" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "colab": { 224 | "base_uri": "https://localhost:8080/", 225 | "height": 503 226 | }, 227 | "id": "ilciueUOf_eN", 228 | "outputId": "90e83a83-ab82-4303-dab6-0e173efd4ad6", 229 | "vscode": { 230 | "languageId": "python" 231 | } 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "# @title Use Violin Plots to compare distributions side by side { vertical-output: true, display-mode: \"form\" }\n", 236 | "col_x = 'Attributed Media Source'\n", 237 | "col_y = 'Install-Touch sec'\n", 238 | "\n", 239 | "sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'}, font_scale=1.15)\n", 240 | 
"sns.set_theme(style=\"whitegrid\")\n", 241 | "\n", 242 | "sec_min = 0 # @param {type:\"number\"}\n", 243 | "sec_max = 960 # @param {type:\"number\"}\n", 244 | "\n", 245 | "f, ax = plt.subplots(figsize=(30, 8))\n", 246 | "ax = sns.violinplot(x=col_x\n", 247 | " , y=col_y\n", 248 | " , data=df[((df[col_y]<=sec_max))], \n", 249 | " palette = \"tab20_r\",bw=.2, cut=1, linewidth=1, order=medias)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "colab": { 257 | "base_uri": "https://localhost:8080/", 258 | "height": 611 259 | }, 260 | "id": "pqV4kmqsWOGU", 261 | "outputId": "6fc9d5b8-4e65-4b1f-990c-78ca25f11cb6", 262 | "vscode": { 263 | "languageId": "python" 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#@title Plot histogram to compare distributions { vertical-output: true, display-mode: \"form\" }\n", 269 | "\n", 270 | "max_sec = 960 # @param {type:\"number\"}\n", 271 | "bsize = 10 # @param {type:\"number\"}\n", 272 | "#Change baseline to desired media source\n", 273 | "baseline = 'googleadwords_int' # @param[\"googleadwords_int\"] {allow-input: true}\n", 274 | "#Change media_source to compare\n", 275 | "media_source = 'googleadwords_int' # @param[\"googleadwords_int\"] {allow-input: true}\n", 276 | "\n", 277 | "df_filtered = df[(df['Install-Touch sec']<= max_sec) & (df['Install-Touch sec']>= 0)]\n", 278 | "df_filtered1 = df_filtered[df_filtered['Attributed Media Source']==baseline]\n", 279 | "df_filtered2 = df_filtered[df_filtered['Attributed Media Source']==media_source]\n", 280 | "\n", 281 | "sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})\n", 282 | "f, ax = plt.subplots(figsize=(20, 10))\n", 283 | "sns.histplot( df_filtered1['Install-Touch sec'], stat='density', kde=False, \n", 284 | " color=\"slategray\", label=baseline, bins=range(0, max_sec + bsize, bsize))\n", 285 | "sns.histplot( df_filtered2['Install-Touch sec'], stat='density', kde=False, \n", 286 | " color=\"deeppink\", label=media_source, bins=range(0, max_sec + bsize, bsize))\n", 287 | "plt.legend()\n", 288 | "plt.show()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "id": "8T03cS-zDf1e" 295 | }, 296 | "source": [ 297 | "# Contribution Ratio" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "colab": { 305 | "base_uri": "https://localhost:8080/", 306 | "height": 1000 307 | }, 308 | "id": "mGsBk93gVbAb", 309 | "outputId": "6c104fb6-1de2-40e5-9bbd-f3e0a1196ebc", 310 | "vscode": { 311 | "languageId": "python" 312 | } 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "#@title Evaluate contribution/attribution ratio { vertical-output: true, display-mode: \"form\" }\n", 317 | "\n", 318 | "df_contrib = pd.DataFrame(df['Attributed Media Source'].value_counts())\\\n", 319 | " .join(pd.DataFrame(df['Contributor 1 Media Source'].value_counts()),how='outer')\\\n", 320 | " .join(pd.DataFrame(df['Contributor 2 Media Source'].value_counts()),how='outer')\\\n", 321 | " .join(pd.DataFrame(df['Contributor 3 Media Source'].value_counts()),how='outer').fillna(0)\n", 322 | "\n", 323 | "df_contrib['Contributions']= df_contrib[list(df_contrib.columns)[1:]].sum(axis=1)\n", 324 | "df_contrib['Ratio']=df_contrib['Contributions'] / df_contrib['Attributed Media Source']\n", 325 | "\n", 326 | "df_contrib=df_contrib.sort_values(by=['Attributed Media Source'],ascending=False)\n", 327 | "df_contrib.style.format({'Attributed Media Source':\"{:,}\",\\\n", 328 | " 
'Contributor 1 Media Source':\"{:,}\",\\\n", 329 | " 'Contributor 2 Media Source':\"{:,}\",\\\n", 330 | " 'Contributor 3 Media Source':\"{:,}\",\\\n", 331 | " 'Contributions':\"{:,}\",\\\n", 332 | " 'Ratio': \"{:.2%}\"})" 333 | ] 334 | } 335 | ], 336 | "metadata": { 337 | "colab": { 338 | "collapsed_sections": [ 339 | "12bMVc1-N5TI", 340 | "ABEgtUqQN8j0" 341 | ], 342 | "name": "[DATA_PILL]_[Appsflyer]_Install_Report_Analysis.ipynb", 343 | "provenance": [] 344 | }, 345 | "kernelspec": { 346 | "display_name": "Python 3", 347 | "name": "python3" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 0 352 | } 353 | --------------------------------------------------------------------------------