├── .github ├── CHANGELOG.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── codacy-analysis.yml │ ├── python-app.yml │ ├── semantic-release.yml │ └── terraform.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── _config.yml ├── cloud_config ├── buyers.json ├── generate_megalist_token.py └── scheduler_sample.json ├── documentation └── Megalista - Technical User Guide - EXTERNAL.pdf ├── generate_megalist_token.sh ├── megalist_dataflow ├── main.py ├── mappers │ ├── __init__.py │ ├── ads_ssd_hashing_mapper.py │ ├── ads_user_list_pii_hashing_mapper.py │ └── ads_user_list_pii_hashing_mapper_test.py ├── megalist_metadata ├── models │ ├── __init__.py │ ├── execution.py │ ├── oauth_credentials.py │ ├── oauth_credentials_test.py │ ├── options.py │ ├── options_test.py │ └── sheets_config.py ├── requirements.txt ├── setup.py ├── sources │ ├── __init__.py │ ├── base_bounded_source.py │ ├── batches_from_executions.py │ ├── firestore_execution_source.py │ └── spreadsheet_execution_source.py └── uploaders │ ├── __init__.py │ ├── appsflyer │ ├── __init__.py │ └── appsflyer_s2s_uploader_async.py │ ├── big_query │ ├── __init__.py │ ├── transactional_events_results_writer.py │ └── transactional_events_results_writer_test.py │ ├── campaign_manager │ ├── __init__.py │ ├── campaign_manager_conversion_uploader.py │ └── campaign_manager_conversion_uploader_test.py │ ├── google_ads │ ├── __init__.py │ ├── conversions │ │ ├── __init__.py │ │ ├── google_ads_offline_conversions_uploader.py │ │ ├── google_ads_offline_conversions_uploader_test.py │ │ ├── google_ads_ssd_uploader.py │ │ └── google_ads_ssd_uploader_test.py │ └── customer_match │ │ ├── __init__.py │ │ ├── abstract_uploader.py │ │ ├── contact_info_uploader.py │ │ ├── mobile_uploader.py │ │ └── user_id_uploader.py │ ├── google_analytics │ ├── __init__.py │ ├── google_analytics_4_measurement_protocol.py │ ├── 
google_analytics_4_measurement_protocol_test.py │ ├── google_analytics_data_import_eraser.py │ ├── google_analytics_data_import_eraser_test.py │ ├── google_analytics_data_import_uploader.py │ ├── google_analytics_data_import_uploader_test.py │ ├── google_analytics_measurement_protocol.py │ ├── google_analytics_user_list_uploader.py │ └── google_analytics_user_list_uploader_test.py │ └── utils.py ├── mypy.ini ├── pyproject.toml ├── run_cloud.sh ├── run_tests.sh ├── terraform ├── external.tf ├── main.tf ├── scripts │ └── deploy_cloud.sh └── variables.tf └── terraform_deploy.sh /.github/CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/.github/CHANGELOG.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '[BUG]' 5 | labels: bug 6 | assignees: '' 7 | --- 8 | 9 | **Describe the bug** 10 | A clear and concise description of what the bug is. 11 | 12 | **To Reproduce** 13 | Steps to reproduce the behavior: 14 | 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 
28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '[NEW]' 5 | labels: enhancement 6 | assignees: '' 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 20 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **What issue does this pull request resolve?** 2 | 3 | **What changes did you make?** 4 | 5 | **Is there anything that requires more attention while reviewing?** 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions 4 | - package-ecosystem: 'github-actions' 5 | directory: '/' 6 | schedule: 7 | interval: 'monthly' 8 | 9 | # Maintain dependencies for npm 10 | - package-ecosystem: 'pip' 11 | directory: '/megalist_dataflow' 12 | schedule: 13 | interval: 'monthly' 14 | -------------------------------------------------------------------------------- /.github/workflows/codacy-analysis.yml: -------------------------------------------------------------------------------- 1 | # 
This workflow checks out code, performs a Codacy security scan 2 | # and integrates the results with the 3 | # GitHub Advanced Security code scanning feature. For more information on 4 | # the Codacy security scan action usage and parameters, see 5 | # https://github.com/codacy/codacy-analysis-cli-action. 6 | # For more information on Codacy Analysis CLI in general, see 7 | # https://github.com/codacy/codacy-analysis-cli. 8 | 9 | name: Codacy Analysis 10 | 11 | on: ['push'] 12 | 13 | jobs: 14 | codacy-security-scan: 15 | name: Codacy Analysis 16 | runs-on: ubuntu-latest 17 | steps: 18 | # Checkout the repository to the GitHub Actions runner 19 | - name: Checkout code 20 | uses: actions/checkout@v2 21 | 22 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis 23 | - name: Run Codacy Analysis CLI 24 | uses: codacy/codacy-analysis-cli-action@3.0.0 25 | with: 26 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository 27 | # You can also omit the token and run the tools that support default configurations 28 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 29 | verbose: true 30 | output: results.sarif 31 | format: sarif 32 | # Adjust severity of non-security issues 33 | gh-code-scanning-compat: true 34 | # Force 0 exit code to allow SARIF file generation 35 | # This will handover control about PR rejection to the GitHub side 36 | max-allowed-issues: 2147483647 37 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python 5 | 6 | on: 7 | push: 8 | 
branches: [ develop ] 9 | pull_request: 10 | branches: [ main, master ] 11 | 12 | jobs: 13 | unit_testing: 14 | name: Test 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.8 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.8 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r megalist_dataflow/requirements.txt 27 | - name: Run tests 28 | run: | 29 | ./run_tests.sh 30 | - name: Upload coverage to Codacy 31 | run: export CODACY_PROJECT_TOKEN=${{ secrets.CODACY_PROJECT_TOKEN }} && bash <(curl -Ls https://coverage.codacy.com/get.sh) report -r megalist_dataflow/* 32 | continue-on-error: true 33 | -------------------------------------------------------------------------------- /.github/workflows/semantic-release.yml: -------------------------------------------------------------------------------- 1 | name: Semantic Release 2 | 3 | on: 4 | push: 5 | branches: [ main, master ] 6 | 7 | jobs: 8 | release: 9 | name: Release 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Python Semantic Release 18 | uses: relekang/python-semantic-release@master 19 | with: 20 | github_token: ${{ secrets.GITHUB_TOKEN }} 21 | pypi_token: ${{ secrets.PYPI_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/terraform.yml: -------------------------------------------------------------------------------- 1 | name: Terraform Validate 2 | 3 | on: ['push'] 4 | 5 | jobs: 6 | terraform-actions: 7 | name: tf validate 8 | runs-on: ubuntu-latest 9 | defaults: 10 | run: 11 | working-directory: ./terraform 12 | steps: 13 | - name: Checkout Repository 14 | uses: actions/checkout@master 15 | 16 | - name: HashiCorp - Setup Terraform 17 | uses: hashicorp/setup-terraform@v1.3.2 18 | with: 19 | terraform_version: 0.14.6 20 | 21 | - name: Terraform Init 
22 | id: init 23 | run: terraform init 24 | continue-on-error: true 25 | 26 | - name: Terraform Fmt 27 | id: fmt 28 | run: terraform fmt -check -diff 29 | continue-on-error: true 30 | 31 | - name: Terraform Validate 32 | id: validate 33 | run: terraform validate -no-color 34 | continue-on-error: false 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | *.egg-info 3 | temp 4 | .pytest_cache 5 | .coverage 6 | htmlcov 7 | cloud_config/scheduler.json 8 | run_local.sh 9 | generate_csv.sh 10 | .terraform 11 | .idea 12 | .venv 13 | .vscode 14 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | - Using welcoming and inclusive language 18 | - Being respectful of differing viewpoints and experiences 19 | - Gracefully accepting constructive criticism 20 | - Focusing on what is best for the community 21 | - Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | - The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | - Trolling, insulting/derogatory comments, and personal or political attacks 28 | - Public or private harassment 29 | - Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | - Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. 
Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at koopas@dp6.com.br. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 
13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows 28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MDS - Marketing Data Sync 2 | 3 | Solution based on the [Google Megalista project](https://github.com/google/megalista). 4 | 5 |
6 | 7 |
8 | 9 |

10 | 11 | semantic-release 12 | 13 | 14 | Code quality 15 | 16 | 17 |

18 | 19 | Sample integration code for onboarding offline/CRM data from BigQuery as custom audiences or offline conversions in Google Ads, Google Analytics 360, Google Display & Video 360, Google Campaign Manager and Facebook Ads. 20 | 21 | ## Supported integrations 22 | - **Google Ads** 23 | - Contact Info **Customer Match** (email, phone, address) [[details]](https://support.google.com/google-ads/answer/6379332?&ref_topic=6296507) 24 | - Id Based **Customer Match** (device Id, user id) 25 | - Offline Conversions through **gclid** [[details]](https://support.google.com/google-ads/answer/2998031?) 26 | - Store Sales Direct **(SSD)** conversions [[details]](https://support.google.com/google-ads/answer/9995886?hl=en) 27 | 28 | - **Google Analytics (Universal analytics)** 29 | - Custom segments through **Data Import** [[details]](https://support.google.com/analytics/answer/3191589?hl=en) 30 | - Measurement Protocol [[details]](https://developers.google.com/analytics/devguides/collection/protocol/v1#:~:text=Measurement%20Protocol%20Overview%20bookmark_border&text=The%20Google%20Analytics%20Measurement%20Protocol,directly%20to%20Google%20Analytics%20servers.) 
31 | 32 | - **Campaign Manager** 33 | - Offline Conversions API **(user id, device id, match id, gclid, dclid)** [[details]](https://developers.google.com/doubleclick-advertisers/guides/conversions_upload) 34 | 35 | - **Google Analytics 4** 36 | - Measurement protocol (Web + App) [[details]](https://developers.google.com/analytics/devguides/collection/protocol/ga4) 37 | 38 | - **Appsflyer** 39 | - S2S Offline events API (conversion upload), to be used for audience creation and in-app events with Google Ads and DV360 [[details]](https://support.appsflyer.com/hc/en-us/articles/207034486-API-de-eventos-de-servidor-para-servidor-S2S-mobile-para-mobile) 40 | 41 | ## How does it work 42 | MDS was designed to separate the configuration of conversion/audience upload rules from the engine, giving more freedom for non-technical teams (i.e. Media and Business Intelligence) to set up multiple upload rules on their own. 43 | 44 | The solution consists of #1 a Google Spreadsheet (template) in which all rules are defined by mapping a data source (BigQuery Table) to a destination (data upload endpoint) and #2, an Apache Beam workflow running on Google Dataflow, scheduled to upload the data in batch mode. 
45 | 46 | ## Prerequisites 47 | 48 | ### Google Cloud Services 49 | - **Google Cloud Platform** account 50 | - **Billing** enabled 51 | - **BigQuery** enabled 52 | - **Dataflow** enabled 53 | - **Cloud storage** enabled 54 | - **Cloud scheduler** enabled 55 | - At least one of: 56 | - **Google Ads** API Access 57 | - **Campaign Manager** API Access 58 | - **Google Analytics** API Access 59 | - **Python3** 60 | - **Google Cloud SDK** 61 | 62 | ### Access Requirements 63 | These are the minimum roles necessary to deploy MDS: 64 | - OAuth Config Editor 65 | - BigQuery User 66 | - BigQuery Job User 67 | - BigQuery Data Viewer 68 | - Cloud Scheduler Admin 69 | - Storage Admin 70 | - Dataflow Admin 71 | - Service Account Admin 72 | - Logs Viewer 73 | - Service Consumer 74 | 75 | ### APIs 76 | Required APIs will depend on upload endpoints in use. We recommend enabling all of them: 77 | - Google Sheets (required for any use case) [[link]](https://console.cloud.google.com/apis/library/sheets.googleapis.com) 78 | - Google Analytics [[link]](https://console.cloud.google.com/apis/library/analytics.googleapis.com) 79 | - Google Analytics Reporting [[link]](https://console.cloud.google.com/apis/library/analyticsreporting.googleapis.com) 80 | - Google Ads [[link]](https://console.cloud.google.com/apis/library/googleads.googleapis.com) 81 | - Campaign Manager [[link]](https://console.cloud.google.com/apis/library/dfareporting.googleapis.com) 82 | 83 | 84 | ## Installation 85 | 86 | ### Create a copy of the configuration Spreadsheet 87 | WIP 88 | 89 | ### Creating required access tokens 90 | To access campaigns and user lists on Google's platforms, this dataflow will need OAuth tokens for an account that can authenticate in those systems. 91 | 92 | In order to create it, follow these steps: 93 | - Access GCP console 94 | - Go to the **API & Services** section on the top-left menu. 
95 | - Open the **OAuth Consent Screen** and configure an *Application name* 96 | - Then, go to the **Credentials** section and create an *OAuth client Id* with Application type set as *Desktop App* 97 | - This will generate a *Client Id* and a *Client secret* 98 | - Run the **generate_mds_token.sh** script in this folder providing these two values and follow the instructions 99 | - Sample: `./generate_mds_token.sh client_id client_secret` 100 | - This will generate the *Access Token* and the *Refresh token* 101 | 102 | ### Creating a bucket on Cloud Storage 103 | This bucket will hold the deployed code for this solution. To create it, navigate to the *Storage* link on the top-left menu on GCP and click on *Create bucket*. You can use Regional location and Standard data type for this bucket. 104 | 105 | ## Running MDS 106 | 107 | We recommend first running it locally and making sure that everything works. 108 | Make some sample tables on BigQuery for one of the uploaders and make sure that the data is arriving correctly at the destination. 109 | After that is done, upload the Dataflow template to GCP and try running it manually via the UI to make sure it works. 110 | Lastly, configure the Cloud Scheduler to run MDS at the desired frequency and you'll have a fully functional data integration pipeline. 
111 | 112 | ### Running locally 113 | ```bash 114 | python3 mds_dataflow/main.py \ 115 | --runner DirectRunner \ 116 | --developer_token ${GOOGLE_ADS_DEVELOPER_TOKEN} \ 117 | --setup_sheet_id ${CONFIGURATION_SHEET_ID} \ 118 | --refresh_token ${REFRESH_TOKEN} \ 119 | --access_token ${ACCESS_TOKEN} \ 120 | --client_id ${CLIENT_ID} \ 121 | --client_secret ${CLIENT_SECRET} \ 122 | --project ${GCP_PROJECT_ID} \ 123 | --region us-central1 \ 124 | --temp_location gs://${GCS_BUCKET}/tmp 125 | ``` 126 | 127 | ### Deploying Pipeline 128 | To deploy, use the following commands from the root folder: 129 | ``` 130 | cd terraform 131 | ./scripts/deploy_cloud.sh project_id bucket_name region_name 132 | ``` 133 | 134 | #### Manually executing pipeline using Dataflow UI 135 | To execute the pipeline, use the following steps: 136 | - Go to **Dataflow** on GCP console 137 | - Click on *Create job from template* 138 | - On the template selection dropdown, select *Custom template* 139 | - Find the *mds* file on the bucket you've created, on the templates folder 140 | - Fill in the parameters required and execute 141 | 142 | ### Scheduling pipeline 143 | To schedule daily/hourly runs, go to **Cloud Scheduler**: 144 | - Click on *create job* 145 | - Add a name and frequency as desired 146 | - For *target* set as HTTP 147 | - Configure a *POST* for url: https://dataflow.googleapis.com/v1b3/projects/${YOUR_PROJECT_ID}/locations/${LOCATION}/templates:launch?gcsPath=gs://${BUCKET_NAME}/templates/mds, replacing the params with the actual values 148 | - For a sample on the *body* of the request, check **cloud_config/scheduler_sample.json** 149 | - Add OAuth Headers 150 | - Scope: https://www.googleapis.com/auth/cloud-platform 151 | 152 | #### Creating a Service Account 153 | It's recommended to create a new Service Account to be used with the Cloud Scheduler 154 | - Go to IAM & Admin > Service Accounts 155 | - Create a new Service Account with the following roles: 156 | - Cloud Dataflow Service Agent
157 | - Dataflow Admin 158 | - Storage Objects Viewer 159 | 160 | 161 | ## Usage 162 | Every upload method expects as source a BigQuery data with specific fields, in addition to specific configuration metadata. For details on how to setup your upload routines, refer to the [MDS Wiki](https://github.com/dp6/marketing-data-sync/wiki) or the [MDS user guide](https://github.com/dp6/marketing-data-sync/blob/main/documentation/mds%20-%20Technical%20User%20Guide%20-%20EXTERNAL.pdf). 163 | 164 | ### Mandatory requirements 165 | 166 | Only contributions that meet the following requirements will be accepted: 167 | 168 | - [Commit pattern](https://www.conventionalcommits.org/en/v1.0.0/) 169 | 170 | ## Support: 171 | 172 | **DP6 Koopa-troopa Team** 173 | 174 | _e-mail: _ 175 | 176 | 177 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | title: DP6 - Centro de inovações 2 | initiative: 'Marketing Data Sync' 3 | 4 | remote_theme: dp6/dp6.github.io 5 | 6 | plugins: 7 | - jekyll-sitemap 8 | - jekyll-gzip 9 | -------------------------------------------------------------------------------- /cloud_config/buyers.json: -------------------------------------------------------------------------------- 1 | {"user_id": "uuid1", "gclid":"E45C235","mobile_device_id":"A76B923847E","email":"test@test.com","mailing_address":{"first_name":"John","last_name":"Doe","country":"US","zip":"111-2222"},"phone":"555-1234","conversions":[{"id":"1","value":3.75,"time":"2019-06-21 01:11:21.805627 UTC"},{"id":"2","value":5.99,"time":"2019-06-21 01:11:21.805627 UTC"}]} 2 | {"user_id": "uuid2","gclid":"AB9203","mobile_device_id":"35883792E","email":"test2@test2.com","mailing_address":{"first_name":"Jane","last_name":"Doe","country":"US","zip":"111-2222"},"phone":"555-4321","conversions":[{"id":"1","value":53,"time":"2019-06-21 01:10:48.460715 
UTC"},{"id":"2","value":12.99,"time":"2019-06-21 01:10:48.460715 UTC"}]} -------------------------------------------------------------------------------- /cloud_config/generate_megalist_token.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Generates refresh token for AdWords using the Installed Application flow.""" 18 | 19 | 20 | import argparse 21 | import sys 22 | 23 | from google_auth_oauthlib.flow import InstalledAppFlow 24 | from oauthlib.oauth2.rfc6749.errors import InvalidGrantError 25 | 26 | # Your OAuth2 Client ID and Secret. If you do not have an ID and Secret yet, 27 | # please go to https://console.developers.google.com and create a set. 28 | DEFAULT_CLIENT_ID = None 29 | DEFAULT_CLIENT_SECRET = None 30 | 31 | # The AdWords API OAuth2 scope. 32 | SCOPES = ['https://www.googleapis.com/auth/adwords', 33 | 'https://www.googleapis.com/auth/dfareporting', 34 | 'https://www.googleapis.com/auth/dfatrafficking', 35 | 'https://www.googleapis.com/auth/ddmconversions', 36 | "https://www.googleapis.com/auth/analytics.edit", 37 | 'https://www.googleapis.com/auth/spreadsheets.readonly'] 38 | 39 | # The redirect URI set for the given Client ID. The redirect URI for Client ID 40 | # generated for an installed application will always have this value. 
41 | _REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob' 42 | 43 | parser = argparse.ArgumentParser(description='Generates a refresh token with ' 44 | 'the provided credentials.') 45 | parser.add_argument('--client_id', default=DEFAULT_CLIENT_ID, 46 | help='Client Id retrieved from the Developer\'s Console.') 47 | parser.add_argument('--client_secret', default=DEFAULT_CLIENT_SECRET, 48 | help='Client Secret retrieved from the Developer\'s ' 49 | 'Console.') 50 | parser.add_argument('--additional_scopes', default=None, 51 | help='Additional scopes to apply when generating the ' 52 | 'refresh token. Each scope should be separated by a comma.') 53 | 54 | 55 | class ClientConfigBuilder(object): 56 | """Helper class used to build a client config dict used in the OAuth 2.0 flow. 57 | """ 58 | _DEFAULT_AUTH_URI = 'https://accounts.google.com/o/oauth2/auth' 59 | _DEFAULT_TOKEN_URI = 'https://accounts.google.com/o/oauth2/token' 60 | CLIENT_TYPE_WEB = 'web' 61 | CLIENT_TYPE_INSTALLED_APP = 'installed' 62 | 63 | def __init__(self, client_type=None, client_id=None, client_secret=None, 64 | auth_uri=_DEFAULT_AUTH_URI, token_uri=_DEFAULT_TOKEN_URI): 65 | self.client_type = client_type 66 | self.client_id = client_id 67 | self.client_secret = client_secret 68 | self.auth_uri = auth_uri 69 | self.token_uri = token_uri 70 | 71 | def Build(self): 72 | """Builds a client config dictionary used in the OAuth 2.0 flow.""" 73 | if all((self.client_type, self.client_id, self.client_secret, 74 | self.auth_uri, self.token_uri)): 75 | client_config = { 76 | self.client_type: { 77 | 'client_id': self.client_id, 78 | 'client_secret': self.client_secret, 79 | 'auth_uri': self.auth_uri, 80 | 'token_uri': self.token_uri 81 | } 82 | } 83 | else: 84 | raise ValueError('Required field is missing.') 85 | 86 | return client_config 87 | 88 | 89 | def main(client_id, client_secret, scopes): 90 | """Retrieve and display the access and refresh token.""" 91 | client_config = ClientConfigBuilder( 92 | 
client_type=ClientConfigBuilder.CLIENT_TYPE_WEB, client_id=client_id, 93 | client_secret=client_secret) 94 | 95 | flow = InstalledAppFlow.from_client_config( 96 | client_config.Build(), scopes=scopes) 97 | # Note that from_client_config will not produce a flow with the 98 | # redirect_uris (if any) set in the client_config. This must be set 99 | # separately. 100 | flow.redirect_uri = _REDIRECT_URI 101 | 102 | auth_url, _ = flow.authorization_url(prompt='consent') 103 | 104 | print('Log into the Google Account you use to access your AdWords account ' 105 | 'and go to the following URL: \n%s\n' % auth_url) 106 | print('After approving the token enter the verification code (if specified).') 107 | code = input('Code: ').strip() 108 | 109 | try: 110 | flow.fetch_token(code=code) 111 | except InvalidGrantError as ex: 112 | print('Authentication has failed: %s' % ex) 113 | sys.exit(1) 114 | 115 | print('Access token: %s' % flow.credentials.token) 116 | print('Refresh token: %s' % flow.credentials.refresh_token) 117 | 118 | 119 | if __name__ == '__main__': 120 | args = parser.parse_args() 121 | configured_scopes = SCOPES 122 | if not (any([args.client_id, DEFAULT_CLIENT_ID]) and 123 | any([args.client_secret, DEFAULT_CLIENT_SECRET])): 124 | raise AttributeError('No client_id or client_secret specified.') 125 | if args.additional_scopes: 126 | configured_scopes.extend( 127 | args.additional_scopes.replace(' ', '').split(',')) 128 | main(args.client_id, args.client_secret, configured_scopes) 129 | -------------------------------------------------------------------------------- /cloud_config/scheduler_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobName": "mds_daily", 3 | "parameters": { 4 | "developer_token": "Google Ads Developer Token", 5 | "client_id": "GCP OAuth Client id", 6 | "client_secret": "GCP OAuth Client Secret", 7 | "access_token": "GCP OAuth access token", 8 | "refresh_token": "GCP OAuth refresh 
token", 9 | "setup_sheet_id": "Setup Google Sheets Id", 10 | "bq_ops_dataset": "Auxliary bigquery dataset used for MDS operations", 11 | "appsflyer_dev_key": "Apps flyer dev key" 12 | }, 13 | "environment": { 14 | "tempLocation": "gs://bucket-name/temp", 15 | "zone": "us-central1-f" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /documentation/Megalista - Technical User Guide - EXTERNAL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/documentation/Megalista - Technical User Guide - EXTERNAL.pdf -------------------------------------------------------------------------------- /generate_megalist_token.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | if [ $# != 2 ]; then 17 | echo "Usage: $0 client_id client_secret" 18 | exit 1 19 | fi 20 | 21 | pip3 install --user -q -r megalist_dataflow/requirements.txt 22 | python3 cloud_config/generate_megalist_token.py --client_id $1 --client_secret $2 23 | -------------------------------------------------------------------------------- /megalist_dataflow/mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /megalist_dataflow/mappers/ads_ssd_hashing_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Iterable 16 | from models.execution import Batch 17 | 18 | class AdsSSDHashingMapper(): 19 | def _hash_field(self, s): 20 | import hashlib 21 | return hashlib.sha256(s.strip().lower().encode('utf-8')).hexdigest() 22 | 23 | def _map_conversion(self, conversion): 24 | return { 25 | 'hashedEmail': self._hash_field(conversion['email']), 26 | 'time': conversion['time'], 27 | 'amount': conversion['amount'] 28 | } 29 | 30 | def _map_conversions(self, conversions): 31 | return [self._map_conversion(conversion) for conversion in conversions] 32 | 33 | def map_batch(self, batch: Batch): 34 | return Batch(batch.execution, self._map_conversions(batch.elements)) 35 | 36 | -------------------------------------------------------------------------------- /megalist_dataflow/mappers/ads_user_list_pii_hashing_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import logging 16 | 17 | from models.execution import Batch 18 | 19 | 20 | class FieldHasher: 21 | 22 | def __init__(self, should_hash_fields): 23 | self.should_hash_fields = should_hash_fields 24 | 25 | def hash_field(self, field): 26 | import hashlib 27 | 28 | if self.should_hash_fields: 29 | return hashlib.sha256(field.strip().lower().encode('utf-8')).hexdigest() 30 | 31 | return field 32 | 33 | 34 | class AdsUserListPIIHashingMapper: 35 | def __init__(self): 36 | self.logger = logging.getLogger( 37 | 'megalista.AdsUserListPIIHashingMapper') 38 | 39 | def _hash_user(self, user, hasher): 40 | 41 | hashed = user.copy() 42 | 43 | try: 44 | if 'email' in user: 45 | hashed['hashedEmail'] = hasher.hash_field(user['email']) 46 | del hashed['email'] 47 | except: 48 | self.logger.error("Error hashing email for user: %s" % user) 49 | 50 | try: 51 | if 'mailing_address_first_name' in user and 'mailing_address_last_name' in user: 52 | hashed['addressInfo'] = { 53 | 'hashedFirstName': hasher.hash_field(user['mailing_address_first_name']), 54 | 'hashedLastName': hasher.hash_field(user['mailing_address_last_name']), 55 | 'countryCode': user['mailing_address_country'], 56 | 'zipCode': user['mailing_address_zip'] 57 | } 58 | del hashed['mailing_address_first_name'] 59 | del hashed['mailing_address_last_name'] 60 | del hashed['mailing_address_country'] 61 | del hashed['mailing_address_zip'] 62 | except: 63 | self.logger.error("Error hashing address for user: %s" % user) 64 | 65 | try: 66 | if 'phone' in user: 67 | hashed['hashedPhoneNumber'] = hasher.hash_field(user['phone']) 68 | del hashed['phone'] 69 | except: 70 | self.logger.error("Error hashing phone for user: %s" % user) 71 | 72 | try: 73 | if 'mobile_device_id' in user: 74 | hashed['mobileId'] = user['mobile_device_id'] 75 | del hashed['mobile_device_id'] 76 | except: 77 | self.logger.error( 78 | "Error hashing mobile_device_id for user: %s" % user) 79 | 80 | try: 81 | if 'user_id' in user: 82 | hashed['userId'] 
= hasher.hash_field(user['user_id']) 83 | del hashed['user_id'] 84 | except: 85 | self.logger.error("Error hashing user_id for user: %s" % user) 86 | 87 | return hashed 88 | 89 | def _get_should_hash_fields(self, metadata_list): 90 | 91 | if len(metadata_list) < 3: 92 | return True 93 | 94 | should_hash_fields = metadata_list[2] 95 | 96 | if not should_hash_fields: 97 | return True 98 | 99 | return should_hash_fields.lower() != 'false' 100 | 101 | def hash_users(self, batch: Batch): 102 | 103 | should_hash_fields = self._get_should_hash_fields( 104 | batch.execution.destination.destination_metadata) 105 | self.logger.debug('Should hash fields is %s' % should_hash_fields) 106 | 107 | return Batch(batch.execution, [self._hash_user(element, FieldHasher(should_hash_fields)) for element in batch.elements]) 108 | -------------------------------------------------------------------------------- /megalist_dataflow/mappers/ads_user_list_pii_hashing_mapper_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from mappers.ads_user_list_pii_hashing_mapper import AdsUserListPIIHashingMapper 16 | 17 | from models.execution import Batch 18 | 19 | 20 | def test_get_should_hash_fields(): 21 | 22 | hasher = AdsUserListPIIHashingMapper() 23 | 24 | # True 25 | assert hasher._get_should_hash_fields(['ListName', 'Operator', 'True']) 26 | assert hasher._get_should_hash_fields(['ListName', 'Operator']) 27 | assert hasher._get_should_hash_fields(['ListName', 'Operator', None]) 28 | assert hasher._get_should_hash_fields(['ListName', 'Operator', '']) 29 | assert hasher._get_should_hash_fields(['ListName', 'Operator', 'anything']) 30 | 31 | # False 32 | assert not hasher._get_should_hash_fields(['ListName', 'Operator', 'false']) 33 | assert not hasher._get_should_hash_fields(['ListName', 'Operator', 'FALSE']) 34 | assert not hasher._get_should_hash_fields(['ListName', 'Operator', 'False']) 35 | 36 | 37 | def test_pii_hashing(mocker): 38 | 39 | users = [{ 40 | "email": "john@doe.com", 41 | "mailing_address_first_name": "John", 42 | "mailing_address_last_name": "Doe", 43 | "mailing_address_zip": "12345", 44 | "mailing_address_country": "US" 45 | }, 46 | { 47 | "email": "jane@doe.com", 48 | "mailing_address_first_name": "Jane", 49 | "mailing_address_last_name": "Doe", 50 | "mailing_address_zip": "12345", 51 | "mailing_address_country": "US" 52 | }] 53 | 54 | # Execution mock 55 | execution = mocker.MagicMock() 56 | execution.destination.destination_metadata = ['Audience', 'ADD'] 57 | 58 | batch = Batch(execution, [users[0], users[1]]) 59 | 60 | # Call 61 | hasher = AdsUserListPIIHashingMapper() 62 | hashed = hasher.hash_users(batch).elements 63 | 64 | assert len(hashed) == 2 65 | 66 | assert hashed[0] == { 67 | 'hashedEmail': 'd709f370e52b57b4eb75f04e2b3422c4d41a05148cad8f81776d94a048fb70af', 68 | 'addressInfo': { 69 | 'countryCode': 'US', 70 | 'hashedFirstName': '96d9632f363564cc3032521409cf22a852f2032eec099ed5967c0d000cec607a', 71 | 'hashedLastName': 
'799ef92a11af918e3fb741df42934f3b568ed2d93ac1df74f1b8d41a27932a6f', 72 | 'zipCode': '12345' 73 | }} 74 | 75 | assert hashed[1] == { 76 | 'hashedEmail': '7c815580ad3844bcb627c74d24eaf700e1a711d9c23e9beb62ab8d28e8cb7954', 77 | 'addressInfo': { 78 | 'countryCode': 'US', 79 | 'hashedFirstName': '81f8f6dde88365f3928796ec7aa53f72820b06db8664f5fe76a7eb13e24546a2', 80 | 'hashedLastName': '799ef92a11af918e3fb741df42934f3b568ed2d93ac1df74f1b8d41a27932a6f', 81 | 'zipCode': '12345' 82 | }} 83 | 84 | 85 | def test_avoid_pii_hashing(mocker): 86 | users = [{ 87 | "email": "john@doe.com", 88 | "mailing_address_first_name": "John", 89 | "mailing_address_last_name": "Doe", 90 | "mailing_address_zip": "12345", 91 | "mailing_address_country": "US" 92 | }, 93 | { 94 | "email": "jane@doe.com", 95 | "mailing_address_first_name": "Jane", 96 | "mailing_address_last_name": "Doe", 97 | "mailing_address_zip": "12345", 98 | "mailing_address_country": "US" 99 | }] 100 | 101 | # Mock the execution 102 | execution = mocker.MagicMock() 103 | execution.destination.destination_metadata = ['Audience', 'ADD', 'False'] 104 | 105 | batch = Batch(execution, [users[0], users[1]]) 106 | 107 | # Call 108 | hasher = AdsUserListPIIHashingMapper() 109 | hashed = hasher.hash_users(batch).elements 110 | 111 | assert len(hashed) == 2 112 | 113 | assert hashed[0] == { 114 | 'hashedEmail': 'john@doe.com', 115 | 'addressInfo': { 116 | 'countryCode': 'US', 117 | 'hashedFirstName': 'John', 118 | 'hashedLastName': 'Doe', 119 | 'zipCode': '12345' 120 | }} 121 | 122 | assert hashed[1] == { 123 | 'hashedEmail': 'jane@doe.com', 124 | 'addressInfo': { 125 | 'countryCode': 'US', 126 | 'hashedFirstName': 'Jane', 127 | 'hashedLastName': 'Doe', 128 | 'zipCode': '12345' 129 | }} 130 | -------------------------------------------------------------------------------- /megalist_dataflow/megalist_metadata: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Megalist", 3 | "description": 
"Buyers audience generator and uploader", 4 | "parameters": [ 5 | { 6 | "name": "developer_token", 7 | "label": "Google Ads Developer Token", 8 | "help_text": "Google Ads Developer Token", 9 | "is_optional": "true" 10 | }, 11 | { 12 | "name": "client_id", 13 | "label": "Client Id for the Google APIs", 14 | "help_text": "Client Id for the Google APIs" 15 | }, 16 | { 17 | "name": "client_secret", 18 | "label": "Client Secret for the Google APIs", 19 | "help_text": "Client Secret for the Google APIs" 20 | }, 21 | { 22 | "name": "access_token", 23 | "label": "Access Token for the Google APIs", 24 | "help_text": "Access Token for the Google APIs" 25 | }, 26 | { 27 | "name": "refresh_token", 28 | "label": "Refresh Token for the Google APIs", 29 | "help_text": "Refresh Token for the Google APIs" 30 | }, 31 | { 32 | "name": "setup_sheet_id", 33 | "label": "Google Sheets id for config", 34 | "help_text": "Google Sheets id for config" 35 | }, 36 | { 37 | "name": "bq_ops_dataset", 38 | "label": "Auxliary bigquery dataset used for Megalista operations", 39 | "help_text": "Auxliary bigquery dataset used for Megalista operations" 40 | }, 41 | { 42 | "name": "appsflyer_dev_key", 43 | "label": "Developer key for AppsFlyer S2S API", 44 | "help_text": "Developer key for AppsFlyer S2S API", 45 | "is_optional": "true" 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /megalist_dataflow/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name = "dataflow_deps" -------------------------------------------------------------------------------- /megalist_dataflow/models/execution.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from enum import Enum 16 | from typing import Dict, List, Union 17 | 18 | OK_STATUS = 'OK' 19 | 20 | 21 | class DestinationType(Enum): 22 | CM_OFFLINE_CONVERSION, \ 23 | ADS_OFFLINE_CONVERSION, \ 24 | ADS_SSD_UPLOAD, \ 25 | ADS_ENHANCED_CONVERSION, \ 26 | ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD, \ 27 | ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD, \ 28 | ADS_CUSTOMER_MATCH_USER_ID_UPLOAD, \ 29 | GA_USER_LIST_UPLOAD, \ 30 | APPSFLYER_S2S_EVENTS, \ 31 | GA_MEASUREMENT_PROTOCOL, \ 32 | GA_DATA_IMPORT, \ 33 | GA_4_MEASUREMENT_PROTOCOL = range(12) 34 | 35 | def __eq__(self, other): 36 | if other is None: 37 | return False 38 | return self.name == other.name 39 | 40 | 41 | class SourceType(Enum): 42 | BIG_QUERY, \ 43 | CSV = range(2) 44 | # TODO: CSV not yet implemented 45 | 46 | 47 | class AccountConfig: 48 | def __init__( 49 | self, 50 | google_ads_account_id: str, 51 | mcc: bool, 52 | google_analytics_account_id: str, 53 | campaign_manager_account_id: str, 54 | app_id: str 55 | ): 56 | self._google_ads_account_id = google_ads_account_id 57 | self._mcc = mcc 58 | self._google_analytics_account_id = google_analytics_account_id 59 | self._campaign_manager_account_id = campaign_manager_account_id 60 | self._app_id = app_id 61 | 62 | @property 63 | def google_ads_account_id(self) -> str: 64 | return self._google_ads_account_id 65 | 66 | @property 67 | def mcc(self) -> bool: 68 | return self._mcc 69 | 70 | @property 71 | def google_analytics_account_id(self) -> str: 72 | return self._google_analytics_account_id 73 | 74 | @property 75 | def campaign_manager_account_id(self) -> str: 76 | return self._campaign_manager_account_id 77 | 78 | @property 79 | def app_id(self) -> str: 80 | return self._app_id 81 | 82 | def __str__(self) -> str: 83 | return f"\n[Account Config]\n\t" \ 84 | f"Google Ads Customer Id: {self.google_ads_account_id}\n\t" \ 85 | f"Google Ads MCC: {self._mcc}\n\t" \ 86 | f"Google Analytics Account Id: {self.google_analytics_account_id}\n\t" \ 87 | 
f"Campaign Manager Account Id: {self.campaign_manager_account_id}\n\t" \ 88 | f"Play Store App Id: {self.app_id}" 89 | 90 | def __eq__(self, other): 91 | return self.google_ads_account_id == other.google_ads_account_id \ 92 | and self.google_analytics_account_id == other.google_analytics_account_id \ 93 | and self.campaign_manager_account_id == other.campaign_manager_account_id \ 94 | and self.app_id == other.app_id 95 | 96 | def __hash__(self): 97 | return hash((self.google_ads_account_id, self.google_analytics_account_id, 98 | self.campaign_manager_account_id, self.app_id)) 99 | 100 | 101 | class Source: 102 | def __init__( 103 | self, 104 | source_name: str, 105 | source_type: SourceType, 106 | source_metadata: List[str] 107 | ): 108 | self._source_name = source_name 109 | self._source_type = source_type 110 | self._source_metadata = source_metadata 111 | 112 | @property 113 | def source_name(self) -> str: 114 | return self._source_name 115 | 116 | @property 117 | def source_type(self) -> SourceType: 118 | return self._source_type 119 | 120 | @property 121 | def source_metadata(self) -> List[str]: 122 | return self._source_metadata 123 | 124 | def __eq__(self, other): 125 | return self.source_name == other.source_name \ 126 | and self.source_type == other.source_type \ 127 | and self.source_metadata == other.source_metadata 128 | 129 | def __hash__(self): 130 | return hash((self.source_name, self.source_type, self.source_metadata[0], self.source_metadata[1])) 131 | 132 | 133 | class Destination: 134 | def __init__( 135 | self, 136 | destination_name: str, 137 | destination_type: DestinationType, 138 | destination_metadata: List[str] 139 | ): 140 | self._destination_name = destination_name 141 | self._destination_type = destination_type 142 | self._destination_metadata = destination_metadata 143 | 144 | @property 145 | def destination_name(self) -> str: 146 | return self._destination_name 147 | 148 | @property 149 | def destination_type(self) -> DestinationType: 
150 | return self._destination_type 151 | 152 | @property 153 | def destination_metadata(self) -> List[str]: 154 | return self._destination_metadata 155 | 156 | def __eq__(self, other) -> bool: 157 | return bool(self.destination_name == other.destination_name and self.destination_metadata[0] == other.destination_metadata[0]) 158 | 159 | def __hash__(self) -> int: 160 | return hash((self.destination_name, self.destination_type.name, self.destination_metadata[0])) 161 | 162 | 163 | class Execution: 164 | def __init__( 165 | self, 166 | account_config: AccountConfig, 167 | source: Source, 168 | destination: Destination 169 | ): 170 | self._account_config = account_config 171 | self._source = source 172 | self._destination = destination 173 | 174 | @property 175 | def source(self) -> Source: 176 | return self._source 177 | 178 | @property 179 | def destination(self) -> Destination: 180 | return self._destination 181 | 182 | @property 183 | def account_config(self) -> AccountConfig: 184 | return self._account_config 185 | 186 | def __str__(self): 187 | return 'Origin name: {}. Action: {}. 
Destination name: {}'.format(self.source.source_name, 188 | self.destination.destination_type, 189 | self.destination.destination_name) 190 | 191 | def __eq__(self, other): 192 | if other is None: 193 | return False 194 | return self.source == other.source \ 195 | and self.destination == other.destination \ 196 | and self.account_config == other.account_config 197 | 198 | def __hash__(self): 199 | return hash((self.source, self.destination, self.account_config)) 200 | 201 | 202 | class Batch: 203 | def __init__( 204 | self, 205 | execution: Execution, 206 | elements: List[Dict[str, Union[str, Dict[str, str]]]] 207 | ): 208 | self._execution = execution 209 | self._elements = elements 210 | 211 | @property 212 | def execution(self) -> Execution: 213 | return self._execution 214 | 215 | @property 216 | def elements(self) -> List[Dict[str, Union[str, Dict[str, str]]]]: 217 | return self._elements 218 | 219 | def __str__(self): 220 | return f'Execution: {self._execution}. Elements: {self._elements}' 221 | 222 | def __eq__(self, other): 223 | if other is None: 224 | return False 225 | return self.execution == other.execution and self.elements == other.elements 226 | 227 | def __hash__(self): 228 | return hash(('Batch', self.execution)) 229 | -------------------------------------------------------------------------------- /megalist_dataflow/models/oauth_credentials.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class OAuthCredentials():
    """Bundle of OAuth2 client and token providers for the Google APIs.

    Every constructor argument is a value-provider-like object exposing a
    ``get()`` method, so the underlying strings can be resolved lazily at
    pipeline run time.
    """

    def __init__(self, client_id, client_secret, access_token, refresh_token):
        self.client_id = client_id
        self.client_secret = client_secret
        self.access_token = access_token
        self.refresh_token = refresh_token

    @staticmethod
    def _resolve(provider):
        # All getters funnel through here: unwrap the provider's value.
        return provider.get()

    def get_client_id(self):
        """Resolved OAuth client id."""
        return self._resolve(self.client_id)

    def get_client_secret(self):
        """Resolved OAuth client secret."""
        return self._resolve(self.client_secret)

    def get_access_token(self):
        """Resolved OAuth access token."""
        return self._resolve(self.access_token)

    def get_refresh_token(self):
        """Resolved OAuth refresh token."""
        return self._resolve(self.refresh_token)
def test_init():
    """The getters must return exactly the values wrapped in the providers
    the credentials object was constructed with."""
    wrapped = [StaticValueProvider(str, v)
               for v in ("id", "secret", "access", "refresh")]
    credentials = OAuthCredentials(*wrapped)

    assert credentials.get_client_id() == "id"
    assert credentials.get_client_secret() == "secret"
    assert credentials.get_access_token() == "access"
    assert credentials.get_refresh_token() == "refresh"
class DataflowOptions(PipelineOptions):
    """Command-line options for the Megalista Dataflow pipeline.

    Runtime parameters are declared with ``add_value_provider_argument`` so
    they can be supplied when a Dataflow template is executed; construction
    parameters (project id, output) use plain ``add_argument``.
    """

    @classmethod
    def _add_argparse_args(cls, parser):
        # OAUTH
        parser.add_value_provider_argument(
            '--client_id', help='Client Id for the Google APIs')
        parser.add_value_provider_argument(
            '--client_secret', help='Client Secret for the Google APIs')
        parser.add_value_provider_argument(
            '--refresh_token', help='OAUTH Refresh Token for the Google APIs')
        parser.add_value_provider_argument(
            '--access_token', help='OAUTH Access Token for the Google APIs')
        # Set up
        parser.add_value_provider_argument(
            '--setup_sheet_id', help='Id of Spreadsheet with execution info')
        parser.add_value_provider_argument(
            '--setup_firestore_collection', help='Name of Firestore collection with execution info')
        parser.add_value_provider_argument(
            '--bq_ops_dataset',
            # Fixed typo in help text: 'Auxliary' -> 'Auxiliary'.
            help='Auxiliary bigquery dataset used for Megalista operations')
        # Google Ads
        parser.add_value_provider_argument(
            '--developer_token', help='Developer Token for Google Ads API')
        parser.add_value_provider_argument(
            '--customer_id', help='Google Ads Customer Id')
        # Google Analytics
        parser.add_value_provider_argument(
            '--google_analytics_account_id', help='Google Analytics account Id')
        parser.add_value_provider_argument(
            '--google_analytics_web_property_id',
            help='Google Analytics web property Id')
        parser.add_value_provider_argument(
            '--google_analytics_buyer_custom_dim',
            help='Google Analytics buyer custom dimension')
        parser.add_value_provider_argument(
            '--google_analytics_user_id_custom_dim',
            help='Google Analytics User Id custom dimension')
        # Campaign Manager
        parser.add_value_provider_argument(
            '--dcm_profile_id', help='CampaignManager profile Id')
        parser.add_value_provider_argument(
            '--floodlight_activity_id',
            help='CampaignManager floodlight activity Id')
        parser.add_value_provider_argument(
            '--floodlight_configuration_id',
            help='CampaignManager floodlight configuration Id')
        # Conversion Plus
        parser.add_value_provider_argument(
            '--cp_sheet_id', help='Conversion Plus Sheet Id')
        parser.add_value_provider_argument(
            '--cp_sheet_range',
            help='Name of the Conversion Plus Sheet config range')
        # BigQuery
        parser.add_value_provider_argument(
            '--dataset_id', default='megalist', help='BigQuery dataset Id')
        parser.add_value_provider_argument(
            # Fixed copy-paste defect: help previously read 'BigQuery dataset Id'.
            '--table_id', default='crm_upload', help='BigQuery table Id')
        # GCP
        parser.add_argument(
            '--gcp_project_id', help='ID Google Cloud Project to use')
        parser.add_argument('--output', help='Output file to write results to.')
        # APPSFLYER
        parser.add_value_provider_argument(
            '--appsflyer_dev_key', help='Developer key for AppsFlyer S2S API')
def test_options(mocker):
    """Every Megalista flag must be registered on the argument parser."""
    parser = mocker.MagicMock()
    DataflowOptions._add_argparse_args(parser)

    # Runtime (value-provider) flags that carry only a help text.
    runtime_flags = (
        "--client_id",
        "--client_secret",
        "--refresh_token",
        "--access_token",
        "--developer_token",
        "--customer_id",
        "--google_analytics_account_id",
        "--google_analytics_web_property_id",
        "--google_analytics_buyer_custom_dim",
        "--google_analytics_user_id_custom_dim",
        "--dcm_profile_id",
        "--floodlight_activity_id",
        "--floodlight_configuration_id",
    )
    for flag in runtime_flags:
        parser.add_value_provider_argument.assert_any_call(flag, help=mocker.ANY)

    # Runtime flags declared with an explicit default value.
    parser.add_value_provider_argument.assert_any_call(
        "--dataset_id", default="megalist", help=mocker.ANY)
    parser.add_value_provider_argument.assert_any_call(
        "--table_id", default="crm_upload", help=mocker.ANY)

    # Plain (construction-time) arguments.
    parser.add_argument.assert_any_call("--gcp_project_id", help=mocker.ANY)
    parser.add_argument.assert_any_call("--output", help=mocker.ANY)
class SheetsConfig:
    """Reads Megalista configuration out of a Google Spreadsheet.

    The Sheets API client is created lazily, on first use, from the supplied
    OAuth credentials, and cached for the lifetime of this object.
    """

    def __init__(self, oauth_credentials):
        self._oauth_credentials = oauth_credentials
        self._sheets_service = None  # built on demand by _get_sheets_service

    def _get_sheets_service(self):
        """Returns the cached read-only Sheets API client, building it once."""
        if not self._sheets_service:
            oauth = self._oauth_credentials
            credentials = Credentials(
                token=oauth.get_access_token(),
                refresh_token=oauth.get_refresh_token(),
                client_id=oauth.get_client_id(),
                client_secret=oauth.get_client_secret(),
                token_uri='https://accounts.google.com/o/oauth2/token',
                scopes=['https://www.googleapis.com/auth/spreadsheets.readonly'])
            self._sheets_service = build('sheets', 'v4', credentials=credentials)
        return self._sheets_service

    def to_dict(self, config):
        """Maps rows of [key, op, value, multiplier, ...] into a keyed dict.

        Extra trailing columns in a row are ignored, as before.
        """
        return {row[0]: {"op": row[1], "value": row[2], "multiplier": row[3]}
                for row in config}

    def get_config(self, sheet_id, range):
        """Reads the named range and returns it as a config dict (see to_dict)."""
        return self.to_dict(self.get_range(sheet_id, range)['values'])

    def get_range(self, sheet_id, range):
        """Fetches the raw API response for a named range of the sheet."""
        service = self._get_sheets_service()
        return service.spreadsheets().values().get(
            spreadsheetId=sheet_id, range=range).execute()

    def get_value(self, sheet_id, range):
        """Returns the first cell of the range, or None when it is empty."""
        response = self.get_range(sheet_id, range)
        values = response.get('values')
        return None if values is None else values[0][0]
# Packaging metadata for the megalist_dataflow distribution.

import setuptools

# Distribution version; passed straight through to setuptools below.
__version__ = "1.0.0"
setuptools.setup(
    name='megalist_dataflow',
    version=__version__,
    author='DP6 fork from Google/megalista',
    author_email='koopas@dp6.com.br',
    url='https://github.com/DP6/marketing-data-sync',
    # Runtime dependencies, pinned to exact versions.
    # NOTE(review): several pins differ from requirements.txt (e.g.
    # google-api-python-client 1.10.0 here vs 1.12.8 there) — confirm which
    # set is authoritative.
    install_requires=['googleads==24.1.0', 'google-api-python-client==1.10.0',
                      'google-cloud-core==1.3.0', 'google-cloud-bigquery==1.26.0',
                      'google-cloud-datastore==1.13.1', 'aiohttp==3.7.4'],
    packages=setuptools.find_packages(),
)
class BaseBoundedSource(iobase.BoundedSource):
    """
    Abstract class implementing common methods of BoundedSource applicable to a fixed size Source
    """

    def __init__(self):
        # Lazily computed element count; filled in by count() on first access.
        self._count = None

    def count(self):
        """Returns the memoized element count, computing it once via _do_count()."""
        if self._count is None:
            self._count = self._do_count()
        return self._count

    @abstractmethod
    def _do_count(self):
        """
        :return: Size of source
        """
        raise NotImplementedError

    def split(self,
              desired_bundle_size,  # type: int
              start_position=None,  # type: Optional[Any]
              stop_position=None,  # type: Optional[Any]
              ):  # type: (...) -> Iterator[SourceBundle]
        """Yields contiguous bundles of at most desired_bundle_size elements
        covering [start_position, stop_position)."""
        lower = 0 if start_position is None else start_position
        upper = self.count() if stop_position is None else stop_position

        cursor = lower
        while cursor < upper:
            bundle_end = min(upper, cursor + desired_bundle_size)
            yield iobase.SourceBundle(
                weight=(bundle_end - cursor),
                source=self,
                start_position=cursor,
                stop_position=bundle_end)
            cursor = bundle_end

    def get_range_tracker(self,
                          start_position,  # type: Optional[Any]
                          stop_position,  # type: Optional[Any]
                          ):  # type: (...) -> RangeTracker
        """Builds an offset-based tracker over the requested (or full) range."""
        lower = 0 if start_position is None else start_position
        upper = self.count() if stop_position is None else stop_position
        return OffsetRangeTracker(lower, upper)
    class _ExecutionIntoBigQueryRequestTransactional(beam.DoFn):
        """Builds a BigQuery read request that skips already-uploaded rows.

        Ensures the auxiliary "<table>_uploaded" control table exists, then
        selects only source rows whose uuid is absent from it (anti-join).
        """

        def process(self, execution: Execution) -> Iterable[ReadFromBigQueryRequest]:
            # Source table fully qualified as "<dataset>.<table>".
            table_name = execution.source.source_metadata[0] + \
                '.' + execution.source.source_metadata[1]
            uploaded_table_name = f"{table_name}_uploaded"
            client = bigquery.Client()

            # Control table: one row per uploaded event, partitioned by
            # ingestion date with a 15-day expiration.
            query = "CREATE TABLE IF NOT EXISTS " + uploaded_table_name + " ( \
              timestamp TIMESTAMP OPTIONS(description= 'Event timestamp'), \
              uuid STRING OPTIONS(description='Event unique identifier'))\
              PARTITION BY _PARTITIONDATE \
              OPTIONS(partition_expiration_days=15)"

            logging.getLogger("megalista.ExecutionIntoBigQueryRequestTransactional").info(
                "Creating table %s if it doesn't exist", uploaded_table_name)

            client.query(query).result()

            # Tag rows with the execution hash so they can be grouped back to
            # their Execution after ReadAllFromBigQuery flattens all requests.
            query = f"SELECT Data.*, '{hash(execution)}' AS execution_hash FROM {table_name} AS Data \
              LEFT JOIN {uploaded_table_name} AS Uploaded USING(uuid) \
              WHERE Uploaded.uuid IS NULL;"

            return [ReadFromBigQueryRequest(query=query)]

    class _BatchElements(beam.DoFn):
        """Re-slices one (execution_hash, rows) group into Batch objects of at
        most _batch_size elements each."""

        def __init__(self, batch_size: int):
            # Maximum number of rows per emitted Batch.
            self._batch_size = batch_size

        def process(self, element, executions: Iterable[Execution]):
            # element is (execution_hash, iterable_of_rows); recover the
            # Execution whose hash string matches the grouping key.
            execution = next(
                (execution for execution in executions if str(hash(execution)) == element[0]))
            batch: List[Any] = []
            # NOTE(review): the loop variable shadows the outer `element`
            # parameter; behavior is unaffected but readability suffers.
            for i, element in enumerate(element[1]):
                if i != 0 and i % self._batch_size == 0:
                    # Current slice is full: emit it and start a new one.
                    yield Batch(execution, batch)
                    batch = []
                batch.append(element)
            # Emit the final (possibly partial) slice.
            yield Batch(execution, batch)

    def __init__(
        self,
        destination_type: DestinationType,
        batch_size: int = 5000,
        transactional: bool = False
    ):
        super().__init__()
        # Only executions targeting this destination type are processed.
        self._destination_type = destination_type
        self._batch_size = batch_size
        # Transactional mode tracks uploaded uuids in a control table.
        self._transactional = transactional

    def _get_bq_request_class(self):
        # Transactional executions need the uploaded-rows control table.
        if self._transactional:
            return self._ExecutionIntoBigQueryRequestTransactional()
        return self._ExecutionIntoBigQueryRequest()

    def expand(self, executions):
        # Filter executions for this destination type, turn each into a
        # BigQuery read, flatten the rows, group them back by execution hash
        # and re-slice into fixed-size batches.
        return (
            executions
            | beam.Filter(lambda execution: execution.destination.destination_type == self._destination_type)
            | beam.ParDo(self._get_bq_request_class())
            | beam.io.ReadAllFromBigQuery()
            | beam.GroupBy(lambda x: x['execution_hash'])
            | beam.ParDo(self._BatchElements(self._batch_size), beam.pvalue.AsList(executions))
        )
import distutils.util
import logging

from apache_beam.options.value_provider import ValueProvider

from google.cloud import firestore
from sources.base_bounded_source import BaseBoundedSource
from models.execution import Destination, DestinationType
from models.execution import Execution, AccountConfig
from models.execution import Source, SourceType


class FirestoreExecutionSource(BaseBoundedSource):
    """
    Read Execution data from a Firestore collection. The collection name is set-up in the parameter "setup_firestore_collection"
    """

    def __init__(
        self,
        setup_firestore_collection: ValueProvider
    ):
        super().__init__()
        self._setup_firestore_collection = setup_firestore_collection

    def _do_count(self):
        # TODO: implement count
        return 3

    def read(self, range_tracker):
        """Yields one Execution per active document in the collection.

        Account-level settings come from the 'account_config' document; every
        document with active == 'yes' contributes one source/destination pair.
        """
        def document_to_dict(doc):
            # Missing documents resolve to None so absence is detectable.
            if not doc.exists:
                return None
            doc_dict = doc.to_dict()
            doc_dict['id'] = doc.id
            return doc_dict

        firestore_collection = self._setup_firestore_collection.get()
        logging.getLogger("megalista.FirestoreExecutionSource").info(f"Loading Firestore collection {firestore_collection}...")
        db = firestore.Client()
        entries = db.collection(self._setup_firestore_collection.get()).where('active', '==', 'yes').stream()
        entries = [document_to_dict(doc) for doc in entries]

        account_data = document_to_dict(db.collection(self._setup_firestore_collection.get()).document('account_config').get())

        if not account_data:
            raise Exception('Firestore collection is absent')
        google_ads_id = account_data.get('google_ads_id', 'empty')
        mcc_trix = account_data.get('mcc_trix', 'FALSE')
        # 'TRUE'/'FALSE' strings from Firestore parsed into a real bool.
        mcc = False if mcc_trix is None else bool(distutils.util.strtobool(mcc_trix))
        app_id = account_data.get('app_id', 'empty')
        google_analytics_account_id = account_data.get('google_analytics_account_id', 'empty')
        campaign_manager_account_id = account_data.get('campaign_manager_account_id', 'empty')

        account_config = AccountConfig(google_ads_id, mcc, google_analytics_account_id, campaign_manager_account_id, app_id)
        logging.getLogger("megalista.FirestoreExecutionSource").info(f"Loaded: {account_config}")

        sources = self._read_sources(entries)
        destinations = self._read_destination(entries)
        if entries:
            for entry in entries:
                # Double-check the flag even though the query filtered on it.
                if entry['active'].upper() == 'YES':
                    logging.getLogger("megalista.FirestoreExecutionSource").info(
                        f"Executing step Source:{sources[entry['id'] + '_source'].source_name} -> Destination:{destinations[entry['id'] + '_destination'].destination_name}")
                    yield Execution(account_config, sources[entry['id'] + '_source'], destinations[entry['id'] + '_destination'])
        else:
            logging.getLogger("megalista.FirestoreExecutionSource").warn("No schedules found!")

    @staticmethod
    def _read_sources(entries):
        """Builds one Source per entry, keyed by '<id>_source'."""
        sources = {}
        if entries:
            for entry in entries:
                metadata = [entry['bq_dataset'], entry['bq_table']]  #TODO: flexibilize for other source types
                source = Source(entry['id'] + '_source', SourceType[entry['source']], metadata)
                sources[source.source_name] = source
        else:
            logging.getLogger("megalista.FirestoreExecutionSource").warn("No sources found!")
        return sources

    @staticmethod
    def _read_destination(entries):
        """Builds one Destination per entry, keyed by '<id>_destination'.

        The metadata fields each destination type requires are looked up in a
        per-type table; an unknown type or a missing field raises.
        """
        def create_metadata_list(entry):
            # Required Firestore fields, by destination type.
            metadata_list = {
                'ADS_OFFLINE_CONVERSION': ['gads_conversion_name'],
                'ADS_SSD_UPLOAD': ['gads_conversion_name', 'gads_external_upload_id'],
                'ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD': ['gads_audience_name', 'gads_operation', 'gads_hash'],
                'ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD': ['gads_audience_name', 'gads_operation'],
                'ADS_CUSTOMER_MATCH_USER_ID_UPLOAD': ['gads_audience_name', 'gads_operation'],
                'GA_MEASUREMENT_PROTOCOL': ['google_analytics_property_id', 'google_analytics_non_interaction'],
                'CM_OFFLINE_CONVERSION': ['campaign_manager_floodlight_activity_id', 'campaign_manager_floodlight_configuration_id'],
                'APPSFLYER_S2S_EVENTS': ['appsflyer_app_id'],
            }

            entry_type = entry['type']
            metadata = metadata_list.get(entry_type, None)
            if not metadata:
                raise Exception(f'Upload type not implemented: {entry_type}')
            entry_metadata = []
            for m in metadata:
                if m in entry:
                    entry_metadata.append(entry[m])
                else:
                    raise Exception(f'Missing field in Firestore document for {entry_type}: {m}')
            return entry_metadata


        destinations = {}
        if entries:
            for entry in entries:
                destination = Destination(entry['id'] + '_destination', DestinationType[entry['type']], create_metadata_list(entry))
                destinations[destination.destination_name] = destination
        else:
            logging.getLogger("megalista.FirestoreExecutionSource").warn("No destinations found!")
        return destinations
import distutils.util
import logging

from apache_beam.options.value_provider import ValueProvider

from sources.base_bounded_source import BaseBoundedSource
from models.execution import Destination, DestinationType
from models.execution import Execution, AccountConfig
from models.execution import Source, SourceType
from models.sheets_config import SheetsConfig


class SpreadsheetExecutionSource(BaseBoundedSource):
    """
    Read Execution data from a sheet. The sheet id is set-up in the parameter "setup_sheet_id"
    """

    def __init__(
        self,
        sheets_config: SheetsConfig,
        setup_sheet_id: ValueProvider
    ):
        super().__init__()
        self._sheets_config = sheets_config
        self._setup_sheet_id = setup_sheet_id

    def _do_count(self):
        # TODO: really count the number of lines in the sheet
        return 3

    def read(self, range_tracker):
        """Yields one Execution for every 'YES' row in the SchedulesRange."""
        sheet_id = self._setup_sheet_id.get()
        logging.getLogger("megalista.SpreadsheetExecutionSource").info(f"Loading configuration sheet {sheet_id}...")
        # Account-level settings, each read from a named single-cell range.
        google_ads_id = self._sheets_config.get_value(sheet_id, "GoogleAdsAccountId")
        mcc_trix = self._sheets_config.get_value(sheet_id, "GoogleAdsMCC")
        # Sheet stores 'TRUE'/'FALSE'; parse to a real bool (empty -> False).
        mcc = False if mcc_trix is None else bool(distutils.util.strtobool(mcc_trix))
        app_id = self._sheets_config.get_value(sheet_id, "AppId")
        google_analytics_account_id = self._sheets_config.get_value(sheet_id, "GoogleAnalyticsAccountId")
        campaign_manager_account_id = self._sheets_config.get_value(sheet_id, "CampaignManagerAccountId")
        account_config = AccountConfig(google_ads_id, mcc, google_analytics_account_id, campaign_manager_account_id, app_id)
        logging.getLogger("megalista.SpreadsheetExecutionSource").info(f"Loaded: {account_config}")

        sources = self._read_sources(self._sheets_config, sheet_id)
        destinations = self._read_destination(self._sheets_config, sheet_id)

        # Schedule rows look like [active_flag, source_name, destination_name].
        schedules_range = self._sheets_config.get_range(sheet_id, 'SchedulesRange')
        if 'values' in schedules_range:
            for schedule in schedules_range['values']:
                if schedule[0] == 'YES':
                    logging.getLogger("megalista.SpreadsheetExecutionSource").info(
                        f"Executing step Source:{sources[schedule[1]].source_name} -> Destination:{destinations[schedule[2]].destination_name}")
                    yield Execution(account_config, sources[schedule[1]], destinations[schedule[2]])
        else:
            logging.getLogger("megalista.SpreadsheetExecutionSource").warn("No schedules found!")

    @staticmethod
    def _read_sources(sheets_config, sheet_id):
        """Builds Sources from rows of [name, type, *metadata], keyed by name."""
        range = sheets_config.get_range(sheet_id, 'SourcesRange')
        sources = {}
        if 'values' in range:
            for row in range['values']:
                source = Source(row[0], SourceType[row[1]], row[2:])
                sources[source.source_name] = source
        else:
            logging.getLogger("megalista.SpreadsheetExecutionSource").warn("No sources found!")
        return sources

    @staticmethod
    def _read_destination(sheets_config, sheet_id):
        """Builds Destinations from rows of [name, type, *metadata], keyed by name."""
        range = sheets_config.get_range(sheet_id, 'DestinationsRange')
        destinations = {}
        if 'values' in range:
            for row in range['values']:
                destination = Destination(row[0], DestinationType[row[1]], row[2:])
                destinations[destination.destination_name] = destination
        else:
            logging.getLogger("megalista.SpreadsheetExecutionSource").warn("No destinations found!")
        return destinations
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name = "uploaders" -------------------------------------------------------------------------------- /megalist_dataflow/uploaders/appsflyer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/appsflyer/__init__.py -------------------------------------------------------------------------------- /megalist_dataflow/uploaders/appsflyer/appsflyer_s2s_uploader_async.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class AppsFlyerS2SUploaderDoFn(beam.DoFn):
    """Uploads in-app events to the AppsFlyer S2S (server-to-server) events API.

    Events are sent concurrently (one HTTP POST per event) over a shared
    aiohttp session, with up to 3 attempts per event and a linear backoff.
    After each batch the DoFn sleeps as needed to respect the 500 events/sec
    rate limit.
    """

    def __init__(self, dev_key):
        """
        Args:
            dev_key: ValueProvider wrapping the AppsFlyer dev key, sent as the
                'authentication' header on every request.
        """
        super().__init__()
        self.API_URL = "https://api2.appsflyer.com/inappevent/"
        self.dev_key = dev_key
        # Set per-batch in process() from the destination metadata.
        self.app_id = None
        self.timeout = ClientTimeout(total=15)  # 15 sec total timeout

    def start_bundle(self):
        pass

    async def _prepare_and_send(self, session, row, success_elements):
        """Builds the S2S payload for a single row and posts it.

        Appends the row to success_elements when the API answers 200.

        Returns:
            The final HTTP status code, or -1 on request failure.
        """
        # Mandatory fields; everything else is bound only when present.
        payload = {
            "appsflyer_id": row['appsflyer_id'],
            "eventName": row['event_eventName'],
            "eventValue": "",
            "af_events_api": "true"
        }
        self.bind_key(payload, row, 'device_ids_idfa', 'idfa')
        self.bind_key(payload, row, 'device_ids_advertising_id', 'advertising_id')
        self.bind_key(payload, row, 'device_ids_oaid', 'oaid')
        self.bind_key(payload, row, 'device_ids_amazon_aid', 'amazon_aid')
        self.bind_key(payload, row, 'device_ids_imei', 'imei')
        self.bind_key(payload, row, 'customer_user_id', 'customer_user_id')
        self.bind_key(payload, row, 'ip', 'ip')
        self.bind_key(payload, row, 'event_eventValue', 'eventValue')
        self.bind_key(payload, row, 'event_eventTime', 'eventTime')
        if 'eventTime' in payload:
            # The API expects a formatted timestamp, not a datetime object.
            payload['eventTime'] = payload['eventTime'].strftime("%Y-%m-%d %H:%M:%S.%f")
        self.bind_key(payload, row, 'event_eventCurrency', 'eventCurrency')

        # Run request asynchronously.
        response = await self._send_http_request(session, payload, 1)
        if response == 200:
            success_elements.append(row)
        return response

    async def _send_http_request(self, session, payload, curr_retry):
        """POSTs one event; retries up to 3 attempts with linear backoff.

        Returns:
            The HTTP status code of the last attempt, or -1 when the request
            itself raised (connection error, timeout, ...).
        """
        url = self.API_URL + self.app_id
        headers = {
            "authentication": self.dev_key.get(),
            'Content-Type': 'application/json'
        }

        try:
            async with session.post(url, headers=headers, json=payload,
                                    raise_for_status=False, timeout=15) as response:
                if response.status != 200:
                    if curr_retry < 3:
                        # Linear backoff: wait curr_retry seconds before retrying.
                        await asyncio.sleep(curr_retry)
                        return await self._send_http_request(session, payload, curr_retry + 1)
                    logging.getLogger("megalista.AppsFlyerS2SUploader").error(
                        f"Fail to send event. Response code: {response.status}, "
                        f"reason: {response.reason}")
                    # print(await response.text())  # uncomment to troubleshoot
                return response.status

        except Exception as exc:
            if curr_retry < 3:
                await asyncio.sleep(curr_retry)
                return await self._send_http_request(session, payload, curr_retry + 1)
            logging.getLogger("megalista.AppsFlyerS2SUploader").error('Error inserting event: ' + str(exc))
            return -1

    async def _async_request_runner(self, elements, success_elements):
        """Sends all events concurrently over a single client session."""
        # One session for the whole batch to avoid opening a connection per event.
        async with ClientSession(timeout=self.timeout) as session:
            tasks = [
                asyncio.ensure_future(
                    self._prepare_and_send(session, element, success_elements))
                for element in elements
            ]
            return await asyncio.gather(*tasks)

    def bind_key(self, payload, row, row_key, name):
        """Copies row[row_key] into payload[name] when present and non-empty."""
        if row_key in row and row[row_key] is not None and row[row_key] != "":
            payload[name] = row[row_key]

    @utils.safe_process(logger=logging.getLogger("megalista.AppsFlyerS2SUploader"))
    def process(self, batch: Batch, **kwargs):
        """Uploads one Batch of events and yields a Batch of the successful rows."""
        success_elements: List[Any] = []
        start_datetime = datetime.now()
        execution = batch.execution

        self.app_id = execution.destination.destination_metadata[0]

        # Send all requests asynchronously. The loop is explicitly closed so
        # repeated bundles do not leak event loops (the original never closed it).
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(
                self._async_request_runner(batch.elements, success_elements))
        finally:
            loop.close()

        # Throttle to respect the API rate limit of 500 events per second:
        # each batch must take at least len(elements)/500 seconds.
        delta_sec = (datetime.now() - start_datetime).total_seconds()
        min_duration_sec = len(batch.elements) / 500
        if delta_sec < min_duration_sec:
            time.sleep(min_duration_sec - delta_sec)
        logging.getLogger("megalista.AppsFlyerS2SUploader").info(
            f"Successfully uploaded {len(success_elements)}/{len(batch.elements)} events.")

        yield Batch(execution, success_elements)
class TransactionalEventsResultsWriter(beam.DoFn):
    """
    Uploads UUIDs from rows successfully sent by the uploader.
    It uploads the rows to a table with the same name of the source table plus the suffix '_uploaded'.
    """

    def __init__(self, bq_ops_dataset):
        """
        Args:
            bq_ops_dataset: ValueProvider with the BigQuery dataset that holds
                the '*_uploaded' control tables.
        """
        super().__init__()
        self._bq_ops_dataset = bq_ops_dataset

    @utils.safe_process(logger=logging.getLogger("megalista.TransactionalEventsResultsWriter"))
    def process(self, batch: Batch, *args, **kwargs):
        self._do_process(batch, datetime.now().timestamp())

    def _do_process(self, batch: Batch, now):
        """Inserts one (uuid, timestamp) row per batch element into the control table.

        Args:
            batch: Batch whose elements (dicts with a 'uuid' key) were uploaded.
            now: Epoch timestamp recorded alongside every uuid.
        """
        execution = batch.execution

        # Control table: <ops dataset>.<source table>_uploaded.
        # f-string instead of the original fragile multi-line concatenation.
        table_name = (f"{self._bq_ops_dataset.get()}."
                      f"{execution.source.source_metadata[1]}_uploaded")

        rows = batch.elements
        client = self._get_bq_client()
        table = client.get_table(table_name)
        results = client.insert_rows(
            table,
            [{'uuid': row['uuid'], 'timestamp': now} for row in rows],
            (SchemaField("uuid", "string"), SchemaField("timestamp", "timestamp")))

        # insert_rows returns one entry per failed row; surface each failure.
        for result in results:
            logging.getLogger("megalista.TransactionalEventsResultsWriter").error(result['errors'])

    @staticmethod
    def _get_bq_client():
        # Isolated so tests can replace the BigQuery client with a mock.
        return bigquery.Client()
@pytest.fixture
def uploader():
    return TransactionalEventsResultsWriter(StaticValueProvider(str, 'bq_ops_dataset'))


def test_bigquery_write(mocker, uploader):
    """Rows are written to the control table as (uuid, timestamp) pairs."""
    fake_client = mocker.MagicMock()
    mocker.patch.object(uploader, "_get_bq_client")
    uploader._get_bq_client.return_value = fake_client

    fake_table = mocker.MagicMock()
    fake_client.get_table.return_value = fake_table

    timestamp = datetime.datetime.now().timestamp()

    account_config = AccountConfig("account_id", False, "ga_account_id", "", "")
    source = Source("orig1", SourceType.BIG_QUERY, ["dt1", "buyers"])
    destination = Destination(
        "dest1",
        DestinationType.GA_MEASUREMENT_PROTOCOL,
        ["web_property", "view", "c", "list", "d", "buyers_custom_dim"])
    execution = Execution(account_config, source, destination)

    elements = [{"uuid": "uuid-1"}, {"uuid": "uuid-2"}]
    uploader._do_process(Batch(execution, elements), timestamp)

    fake_client.insert_rows.assert_called_once_with(
        fake_table,
        [{"uuid": "uuid-1", "timestamp": timestamp},
         {"uuid": "uuid-2", "timestamp": timestamp}],
        (SchemaField("uuid", "string"),
         SchemaField("timestamp", "timestamp")))


def test_bigquery_write_failure(mocker, uploader, caplog):
    """Errors reported by insert_rows end up in the log output."""
    fake_client = mocker.MagicMock()
    mocker.patch.object(uploader, "_get_bq_client")
    uploader._get_bq_client.return_value = fake_client

    error_message = "This is an error message"
    fake_client.insert_rows.return_value = [{"errors": error_message}]

    account_config = AccountConfig("account_id", False, "ga_account_id", "", "")
    source = Source("orig1", SourceType.BIG_QUERY, ["dt1", "buyers"])
    destination = Destination(
        "dest1",
        DestinationType.GA_MEASUREMENT_PROTOCOL,
        ["web_property", "view", "c", "list", "d", "buyers_custom_dim"])
    execution = Execution(account_config, source, destination)

    uploader.process(Batch(execution, [{"uuid": "uuid-1"}]))

    assert error_message in caplog.text
class CampaignManagerConversionUploaderDoFn(beam.DoFn):
    """Apache Beam DoFn that uploads offline conversions to Campaign Manager."""

    def __init__(self, oauth_credentials):
        """
        Args:
            oauth_credentials: OAuthCredentials wrapper used to build the
                dfareporting service client.
        """
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_dcm_service(self):
        """Builds a dfareporting v3.4 client from the stored OAuth credentials."""
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=[
                'https://www.googleapis.com/auth/dfareporting',
                'https://www.googleapis.com/auth/dfatrafficking',
                'https://www.googleapis.com/auth/ddmconversions'])

        return build('dfareporting', 'v3.4', credentials=credentials)

    def start_bundle(self):
        pass

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Validates that both floodlight ids are present in destination metadata.

        Raises:
            ValueError: when metadata does not have exactly 2 non-empty entries.
        """
        destination = any_execution.destination.destination_metadata
        if len(destination) != 2:
            raise ValueError(
                f'Missing destination information. Found {len(destination)}')

        if not destination[0] \
                or not destination[1]:
            raise ValueError(
                f'Missing destination information. Received {str(destination)}')

    @utils.safe_process(logger=logging.getLogger(_LOGGER_NAME))
    def process(self, batch: Batch, **kwargs):
        self._do_process(batch, time.time())
        yield batch

    def _do_process(self, batch: Batch, timestamp):
        """Validates the execution and uploads the batch's conversions.

        Args:
            batch: Batch of conversion rows.
            timestamp: Epoch seconds used for ordinal/timestampMicros.
        """
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        self._do_upload_data(
            execution.destination.destination_metadata[0],
            execution.destination.destination_metadata[1],
            execution.account_config.campaign_manager_account_id,
            timestamp,
            batch.elements)

    def _do_upload_data(
            self,
            floodlight_activity_id,
            floodlight_configuration_id,
            campaign_manager_account_id,
            timestamp,
            rows):
        """Builds conversion payloads and calls conversions.batchinsert.

        Logs API-reported errors; does not raise on upload failures.
        """
        service = self._get_dcm_service()
        logger = logging.getLogger(_LOGGER_NAME)

        # 10e5 == 1e6: seconds -> microseconds. Loop-invariant, computed once
        # (the original recomputed it twice per row).
        timestamp_micros = math.floor(timestamp * 10e5)

        conversions = []
        for conversion in rows:
            to_upload = {
                'floodlightActivityId': floodlight_activity_id,
                'floodlightConfigurationId': floodlight_configuration_id,
                'ordinal': timestamp_micros,
                'timestampMicros': timestamp_micros
            }

            # Exactly one identifier is attached, in order of preference.
            if 'gclid' in conversion and conversion['gclid']:
                to_upload['gclid'] = conversion['gclid']
            elif 'encryptedUserId' in conversion and conversion['encryptedUserId']:
                to_upload['encryptedUserId'] = conversion['encryptedUserId']
            elif 'mobileDeviceId' in conversion and conversion['mobileDeviceId']:
                to_upload['mobileDeviceId'] = conversion['mobileDeviceId']
            elif 'matchId' in conversion and conversion['matchId']:
                to_upload['matchId'] = conversion['matchId']

            conversions.append(to_upload)

        request_body = {
            'conversions': conversions,
        }

        logger.info(f'Conversions: \n{conversions}')

        request = service.conversions().batchinsert(
            profileId=campaign_manager_account_id, body=request_body)
        response = request.execute()

        if response['hasFailures']:
            logger.error(f'Error(s) inserting conversions:\n{response}')
            conversions_status = response['status']
            error_messages = []

            for status in conversions_status:
                if 'errors' in status:
                    for error in status['errors']:
                        error_messages.append('[{}]: {}'.format(error['code'], error['message']))

            logger.error('Errors from API:\n{}'.format('\n'.join(error_messages)))
_account_config = AccountConfig(mcc=False,
                                campaign_manager_account_id='dcm_profile_id',
                                google_ads_account_id='',
                                google_analytics_account_id='',
                                app_id='')


@pytest.fixture
def uploader(mocker):
    """CampaignManagerConversionUploaderDoFn built from static OAuth values."""
    creds = OAuthCredentials(StaticValueProvider(str, 'id'),
                             StaticValueProvider(str, 'secret'),
                             StaticValueProvider(str, 'access'),
                             StaticValueProvider(str, 'refresh'))
    return CampaignManagerConversionUploaderDoFn(creds)


def test_get_service(uploader):
    assert uploader._get_dcm_service() is not None


def _make_execution(metadata):
    """Builds an Execution pointing at a CM offline-conversion destination."""
    source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
    destination = Destination(
        'dest1', DestinationType.CM_OFFLINE_CONVERSION, metadata)
    return Execution(_account_config, source, destination)


def test_conversion_upload(mocker, uploader):
    """gclid conversions are sent via batchinsert with floodlight metadata."""
    mocker.patch.object(uploader, '_get_dcm_service')

    floodlight_activity_id = 'floodlight_activity_id'
    floodlight_configuration_id = 'floodlight_configuration_id'
    execution = _make_execution(
        (floodlight_activity_id, floodlight_configuration_id))

    now = time.time()
    micros = math.floor(now * 10e5)

    uploader._do_process(
        Batch(execution, [{'gclid': '123'}, {'gclid': '456'}]), now)

    expected_body = {
        'conversions': [{
            'gclid': '123',
            'floodlightActivityId': floodlight_activity_id,
            'floodlightConfigurationId': floodlight_configuration_id,
            'ordinal': micros,
            'timestampMicros': micros
        }, {
            'gclid': '456',
            'floodlightActivityId': floodlight_activity_id,
            'floodlightConfigurationId': floodlight_configuration_id,
            'ordinal': micros,
            'timestampMicros': micros
        }],
    }

    uploader._get_dcm_service().conversions().batchinsert.assert_any_call(
        profileId='dcm_profile_id', body=expected_body)


def test_conversion_upload_match_id(mocker, uploader):
    """matchId conversions are sent via batchinsert."""
    mocker.patch.object(uploader, '_get_dcm_service')

    floodlight_activity_id = 'floodlight_activity_id'
    floodlight_configuration_id = 'floodlight_configuration_id'
    execution = _make_execution(
        (floodlight_activity_id, floodlight_configuration_id))

    now = time.time()
    micros = math.floor(now * 10e5)

    mocker.patch.object(time, 'time')
    time.time.return_value = now

    uploader._do_process(Batch(execution, [{'matchId': 'abc'}]), now)

    expected_body = {
        'conversions': [{
            'matchId': 'abc',
            'floodlightActivityId': floodlight_activity_id,
            'floodlightConfigurationId': floodlight_configuration_id,
            'ordinal': micros,
            'timestampMicros': micros
        }],
    }

    uploader._get_dcm_service().conversions().batchinsert.assert_any_call(
        profileId='dcm_profile_id', body=expected_body)


def test_error_on_api_call(mocker, uploader, caplog):
    """API failures are logged with both the raw response and per-error codes."""
    caplog.set_level(logging.INFO, 'megalista.CampaignManagerConversionsUploader')
    mocker.patch.object(uploader, '_get_dcm_service')
    service = mocker.MagicMock()
    uploader._get_dcm_service.return_value = service

    service.conversions().batchinsert().execute.return_value = {
        'hasFailures': True,
        'status': [{
            'errors': [{
                'code': '123',
                'message': 'error_returned'
            }]
        }]
    }

    execution = _make_execution(['a', 'b'])

    uploader._do_process(Batch(execution, [{'gclid': '123'}]), time.time())

    assert 'Error(s) inserting conversions:' in caplog.text
    assert '[123]: error_returned' in caplog.text
class GoogleAdsOfflineUploaderDoFn(beam.DoFn):
    """Apache Beam DoFn that uploads offline click conversions to Google Ads."""

    def __init__(self, oauth_credentials, developer_token):
        """
        Args:
            oauth_credentials: OAuthCredentials wrapper for the Ads API.
            developer_token: ValueProvider with the dev token, or None to
                disable uploads entirely.
        """
        super().__init__()
        self.oauth_credentials = oauth_credentials
        self.developer_token = developer_token
        # Without a developer token there is nothing we can upload.
        self.active = self.developer_token is not None

    def _get_oc_service(self, customer_id):
        """Builds the AdWords OfflineConversionFeedService client."""
        return utils.get_ads_service('OfflineConversionFeedService', 'v201809',
                                     self.oauth_credentials,
                                     self.developer_token.get(), customer_id)

    def start_bundle(self):
        pass

    @staticmethod
    def _assert_conversion_name_is_present(execution: Execution):
        """Raises ValueError unless destination metadata is exactly one non-empty name."""
        destination = execution.destination.destination_metadata
        if len(destination) != 1:
            raise ValueError('Missing destination information. Found {}'.format(
                len(destination)))

        if not destination[0]:
            raise ValueError('Missing destination information. Received {}'.format(
                str(destination)))

    @utils.safe_process(
        logger=logging.getLogger('megalista.GoogleAdsOfflineUploader'))
    def process(self, batch: Batch, **kwargs):
        if not self.active:
            # Named logger instead of the root logger the original used,
            # consistent with the rest of the megalista.* loggers.
            logging.getLogger('megalista.GoogleAdsOfflineUploader').warning(
                'Skipping upload, parameters not configured.')
            return
        execution = batch.execution
        self._assert_conversion_name_is_present(execution)

        oc_service = self._get_oc_service(
            execution.account_config.google_ads_account_id)

        self._do_upload(oc_service,
                        execution.destination.destination_metadata[0],
                        batch.elements)

    @staticmethod
    def _do_upload(oc_service, conversion_name, rows):
        """Mutates the offline conversion feed with one ADD operation per row."""
        logging.getLogger('megalista.GoogleAdsOfflineUploader').warning(
            'Uploading {} rows to Google Ads'.format(len(rows)))
        upload_data = [{
            'operator': 'ADD',
            'operand': {
                'conversionName': conversion_name,
                'conversionTime': utils.format_date(conversion['time']),
                'conversionValue': conversion['amount'],
                'googleClickId': conversion['gclid']
            }
        } for conversion in rows]

        oc_service.mutate(upload_data)
_account_config = AccountConfig('account_id', False, 'ga_account_id', '', '')


def _make_credentials():
    """OAuthCredentials built from static value providers."""
    return OAuthCredentials(StaticValueProvider(str, 'id'),
                            StaticValueProvider(str, 'secret'),
                            StaticValueProvider(str, 'access'),
                            StaticValueProvider(str, 'refresh'))


@pytest.fixture
def uploader(mocker):
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    return GoogleAdsOfflineUploaderDoFn(_make_credentials(),
                                        StaticValueProvider(str, 'devtoken'))


def test_get_service(mocker, uploader):
    assert uploader._get_oc_service(mocker.ANY) is not None


def test_not_active(mocker, caplog):
    """A missing developer token deactivates the uploader: no service calls."""
    inactive_uploader = GoogleAdsOfflineUploaderDoFn(_make_credentials(), None)
    mocker.patch.object(inactive_uploader, '_get_oc_service')

    inactive_uploader.process(Batch(None, []))

    inactive_uploader._get_oc_service.assert_not_called()
    assert 'Skipping upload, parameters not configured.' in caplog.text


def test_conversion_upload(mocker, uploader):
    """Each row becomes one ADD operation with formatted conversion time."""
    mocker.patch.object(uploader, '_get_oc_service')
    conversion_name = 'user_list'
    destination = Destination(
        'dest1', DestinationType.ADS_OFFLINE_CONVERSION, ['user_list'])
    source = Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers'])
    execution = Execution(_account_config, source, destination)

    time1 = '2020-04-09T14:13:55.0005'
    time1_result = '20200409 141355 America/Sao_Paulo'

    time2 = '2020-04-09T13:13:55.0005'
    time2_result = '20200409 131355 America/Sao_Paulo'

    rows = [
        {'time': time1, 'amount': '123', 'gclid': '456'},
        {'time': time2, 'amount': '234', 'gclid': '567'},
    ]

    uploader.process(Batch(execution, rows))

    expected_operations = [{
        'operator': 'ADD',
        'operand': {
            'conversionName': conversion_name,
            'conversionTime': time1_result,
            'conversionValue': '123',
            'googleClickId': '456'
        }
    }, {
        'operator': 'ADD',
        'operand': {
            'conversionName': conversion_name,
            'conversionTime': time2_result,
            'conversionValue': '234',
            'googleClickId': '567'
        }
    }]

    uploader._get_oc_service.return_value.mutate.assert_any_call(
        expected_operations)
class GoogleAdsSSDUploaderDoFn(beam.DoFn):
    """Apache Beam DoFn that uploads store sales (SSD) transactions to Google Ads."""

    def __init__(self, oauth_credentials, developer_token):
        """
        Args:
            oauth_credentials: OAuthCredentials wrapper for the Ads API.
            developer_token: ValueProvider with the dev token, or None to
                disable uploads entirely.
        """
        super().__init__()
        self.oauth_credentials = oauth_credentials
        self.developer_token = developer_token
        self.active = developer_token is not None

    def _get_ssd_service(self, customer_id):
        """Builds the AdWords OfflineDataUploadService client."""
        return utils.get_ads_service('OfflineDataUploadService', 'v201809',
                                     self.oauth_credentials,
                                     self.developer_token.get(), customer_id)

    @staticmethod
    def _assert_conversion_metadata_is_present(execution: Execution):
        """Raises ValueError unless metadata has exactly 2 entries (name, upload id)."""
        metadata = execution.destination.destination_metadata
        if len(metadata) != 2:
            raise ValueError(
                f'Missing destination information. Received {len(metadata)} entry(ies)')

    @utils.safe_process(
        logger=logging.getLogger('megalista.GoogleAdsSSDUploader'))
    def process(self, batch: Batch, **kwargs):
        if not self.active:
            # The original set self.active but never checked it, so a missing
            # developer token crashed on developer_token.get(). Mirror the
            # guard used by GoogleAdsOfflineUploaderDoFn instead.
            logging.getLogger('megalista.GoogleAdsSSDUploader').warning(
                'Skipping upload, parameters not configured.')
            return
        execution = batch.execution
        self._assert_conversion_metadata_is_present(execution)

        # Use the public accessor, consistent with GoogleAdsOfflineUploaderDoFn
        # (the original reached into the private _google_ads_account_id field).
        ssd_service = self._get_ssd_service(
            execution.account_config.google_ads_account_id)
        self._do_upload(ssd_service,
                        execution.destination.destination_metadata[0],
                        execution.destination.destination_metadata[1], batch.elements)

    @staticmethod
    def _do_upload(ssd_service, conversion_name, ssd_external_upload_id, rows):
        """Mutates the offline data upload service with one store-sales upload.

        Args:
            ssd_service: OfflineDataUploadService client.
            conversion_name: Name of the store-sales conversion.
            ssd_external_upload_id: External id identifying this upload.
            rows: Dicts with 'hashedEmail', 'time' and 'amount' keys.
        """
        upload_data = [{
            'StoreSalesTransaction': {
                'userIdentifiers': [{
                    'userIdentifierType': 'HASHED_EMAIL',
                    'value': conversion['hashedEmail']
                }],
                'transactionTime': utils.format_date(conversion['time']),
                'transactionAmount': {
                    # NOTE(review): currency is hard-coded to BRL — confirm this
                    # is intended for all deployments.
                    'currencyCode': 'BRL',
                    'money': {
                        'microAmount': conversion['amount']
                    }
                },
                'conversionName': conversion_name
            }
        } for conversion in rows]

        offline_data_upload = {
            'externalUploadId': ssd_external_upload_id,
            'offlineDataList': upload_data,
            'uploadType': 'STORE_SALES_UPLOAD_FIRST_PARTY',
            'uploadMetadata': {
                'StoreSalesUploadCommonMetadata': {
                    'xsi_type': 'FirstPartyUploadMetadata',
                    'loyaltyRate': 1.0,
                    'transactionUploadRate': 1.0,
                }
            }
        }

        add_conversions_operation = {
            'operand': offline_data_upload,
            'operator': 'ADD'
        }
        ssd_service.mutate([add_conversions_operation])
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for GoogleAdsSSDUploaderDoFn (store sales direct uploads)."""

import pytest
from apache_beam.options.value_provider import StaticValueProvider

from uploaders.google_ads.conversions.google_ads_ssd_uploader import GoogleAdsSSDUploaderDoFn
from models.execution import AccountConfig
from models.execution import Destination
from models.execution import DestinationType
from models.execution import Execution
from models.execution import Source
from models.execution import SourceType
from models.execution import Batch
from models.oauth_credentials import OAuthCredentials

_account_config = AccountConfig('account_id', False, 'ga_account_id', '', '')


@pytest.fixture
def uploader(mocker):
  """Builds an uploader with googleads clients patched out (no network)."""
  mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
  mocker.patch('googleads.adwords.AdWordsClient')
  # NOTE(review): `id` shadows the builtin; kept as-is to avoid code changes.
  id = StaticValueProvider(str, 'id')
  secret = StaticValueProvider(str, 'secret')
  access = StaticValueProvider(str, 'access')
  refresh = StaticValueProvider(str, 'refresh')
  credentials = OAuthCredentials(id, secret, access, refresh)
  return GoogleAdsSSDUploaderDoFn(credentials,
                                  StaticValueProvider(str, 'devtoken'))


def test_get_service(mocker, uploader):
  """The SSD service factory should return a service object."""
  assert uploader._get_ssd_service(mocker.ANY) is not None


def test_fail_missing_destination_metadata(uploader, mocker):
  """With only one metadata entry, process must bail before touching the API.

  The uploader's safe_process decorator swallows the validation error, so the
  observable effect is that the SSD service is never requested.
  """
  mocker.patch.object(uploader, '_get_ssd_service')
  source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
  destination = Destination('dest1', DestinationType.ADS_SSD_UPLOAD, ['1'])
  execution = Execution(_account_config, source, destination)
  batch = Batch(execution, [])
  uploader.process(batch)
  uploader._get_ssd_service.assert_not_called()


def test_conversion_upload(mocker, uploader):
  """Two conversions should be mapped into one ADD mutate operation.

  The expected payload below must stay byte-for-byte in sync with
  GoogleAdsSSDUploaderDoFn._do_upload.
  """
  mocker.patch.object(uploader, '_get_ssd_service')
  conversion_name = 'ssd_conversion'
  external_upload_id = '123'
  source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
  destination = Destination('dest1', DestinationType.ADS_SSD_UPLOAD,
                            [conversion_name, external_upload_id])
  execution = Execution(_account_config, source, destination)

  # Expected results assume utils.format_date renders America/Sao_Paulo.
  time1 = '2020-04-09T14:13:55.0005'
  time1_result = '20200409 141355 America/Sao_Paulo'

  time2 = '2020-04-09T13:13:55.0005'
  time2_result = '20200409 131355 America/Sao_Paulo'

  batch = Batch(execution, [{
      'hashedEmail': 'a@a.com',
      'time': time1,
      'amount': '123'
  }, {
      'hashedEmail': 'b@b.com',
      'time': time2,
      'amount': '234'
  }])

  uploader.process(batch)

  upload_data = [{
      'StoreSalesTransaction': {
          'userIdentifiers': [{
              'userIdentifierType': 'HASHED_EMAIL',
              'value': 'a@a.com'
          }],
          'transactionTime': time1_result,
          'transactionAmount': {
              'currencyCode': 'BRL',
              'money': {
                  'microAmount': '123'
              }
          },
          'conversionName': conversion_name
      }
  }, {
      'StoreSalesTransaction': {
          'userIdentifiers': [{
              'userIdentifierType': 'HASHED_EMAIL',
              'value': 'b@b.com'
          }],
          'transactionTime': time2_result,
          'transactionAmount': {
              'currencyCode': 'BRL',
              'money': {
                  'microAmount': '234'
              }
          },
          'conversionName': conversion_name
      }
  }]

  uploader._get_ssd_service.return_value.mutate.assert_any_call([{
      'operand': {
          'externalUploadId': external_upload_id,
          'offlineDataList': upload_data,
          'uploadType': 'STORE_SALES_UPLOAD_FIRST_PARTY',
          'uploadMetadata': {
              'StoreSalesUploadCommonMetadata': {
                  'xsi_type': 'FirstPartyUploadMetadata',
                  'loyaltyRate': 1.0,
                  'transactionUploadRate': 1.0,
              }
          }
      },
      'operator': 'ADD'
  }])
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Any, List

import apache_beam as beam
from uploaders import utils
from models.execution import AccountConfig
from models.execution import DestinationType
from models.execution import Batch
from models.oauth_credentials import OAuthCredentials

_DEFAULT_LOGGER: str = 'megalista.GoogleAdsCustomerMatchAbstractUploader'


class GoogleAdsCustomerMatchAbstractUploaderDoFn(beam.DoFn):
  """Template-method base DoFn for Google Ads Customer Match uploads.

  Subclasses provide the list definition, the row keys to forward, and the
  destination type (get_list_definition / get_row_keys / get_action_type);
  this class handles list lookup/creation (with an in-memory cache) and the
  mutateMembers call.
  """

  def __init__(self, oauth_credentials: OAuthCredentials, developer_token: str):
    # NOTE(review): the annotation says str, but _get_user_list_service calls
    # developer_token.get(), so a ValueProvider is expected here — confirm.
    super().__init__()
    self.oauth_credentials = oauth_credentials
    self.developer_token = developer_token
    # When no developer token is configured, process() becomes a no-op.
    self.active = True
    if self.developer_token is None:
      self.active = False
    # Maps list name -> user list id, to avoid repeated lookup/create calls.
    self._user_list_id_cache: Dict[str, str] = {}

  def start_bundle(self):
    pass

  def _create_list_if_it_does_not_exist(self, user_list_service, list_name: str,
                                        list_definition: Dict[str, Any]) -> str:
    """Returns the id for list_name, creating the list once and caching it."""

    if self._user_list_id_cache.get(list_name) is None:
      self._user_list_id_cache[list_name] = \
        self._do_create_list_if_it_does_not_exist(
          user_list_service, list_name, list_definition)

    return self._user_list_id_cache[list_name]

  def _do_create_list_if_it_does_not_exist(self, user_list_service,
                                           list_name: str,
                                           list_definition: Dict[str, Any]
                                           ) -> str:
    """Looks up the user list by exact name; creates it when absent.

    Returns the list id as a string.
    """
    response = user_list_service.get([{
      'fields': ['Id', 'Name'],
      'predicates': [{
        'field': 'Name',
        'operator': 'EQUALS',
        'values': [list_name]
      }]
    }])

    if not response.entries:
      logging.getLogger(_DEFAULT_LOGGER).info(
        '%s list does not exist, creating...', list_name)
      result = user_list_service.mutate([{
        'operator': 'ADD',
        **list_definition
      }])
      list_id = result['value'][0]['id']
      logging.getLogger(_DEFAULT_LOGGER).info('List %s created with id: %d',
                                              list_name, list_id)
    else:
      list_id = response.entries[0]['id']
      logging.getLogger(_DEFAULT_LOGGER).info('List found %s with id: %d',
                                              list_name, list_id)

    return str(list_id)

  # just to facilitate mocking
  def _get_user_list_service(self, customer_id):
    """Builds the AdwordsUserListService client for the given customer."""
    return utils.get_ads_service('AdwordsUserListService', 'v201809',
                                 self.oauth_credentials,
                                 self.developer_token.get(), customer_id)

  def _assert_execution_is_valid(self, execution) -> None:
    """Raises ValueError when the first destination metadata entry is empty."""
    destination = execution.destination.destination_metadata

    # The number of parameters vary by upload. This test could be parameterized
    if not destination[0]:
      raise ValueError('Missing destination information. Received {}'.format(
        str(destination)))

  @utils.safe_process(logger=logging.getLogger(_DEFAULT_LOGGER))
  def process(self, batch: Batch, **kwargs) -> None:
    """Uploads the batch's rows to the Customer Match list.

    destination_metadata[0] is the list name; destination_metadata[1] is the
    mutate operator passed straight to the API.
    """
    if not self.active:
      logging.getLogger(_DEFAULT_LOGGER).warning(
        'Skipping upload to ads, parameters not configured.')
      return

    execution = batch.execution

    self._assert_execution_is_valid(execution)

    user_list_service = self._get_user_list_service(
      execution.account_config.google_ads_account_id)
    list_id = self._create_list_if_it_does_not_exist(
      user_list_service, execution.destination.destination_metadata[0],
      self.get_list_definition(
        execution.account_config,
        execution.destination.destination_metadata))

    # Keep only the columns this upload type cares about.
    rows = self.get_filtered_rows(
      batch.elements, self.get_row_keys())

    mutate_members_operation = {
      'operand': {
        'userListId': list_id,
        'membersList': rows
      },
      'operator': execution.destination.destination_metadata[1]
    }

    utils.safe_call_api(self.call_api, logging, user_list_service, [mutate_members_operation])

  def call_api(self, service, operations):
    """Performs the actual mutateMembers call (separated for mocking)."""
    service.mutateMembers(operations)

  def get_filtered_rows(self, rows: List[Any],
                        keys: List[str]) -> List[Dict[str, Any]]:
    """Projects each row onto `keys` (missing keys become None)."""
    return [{key: row.get(key) for key in keys} for row in rows]

  def get_list_definition(self, account_config: AccountConfig,
                          destination_metadata: List[str]) -> Dict[str, Any]:
    # Abstract: subclasses return the mutate operand describing the list.
    pass

  def get_row_keys(self) -> List[str]:
    # Abstract: subclasses return the row fields forwarded to the API.
    pass

  def get_action_type(self) -> DestinationType:
    # Abstract: subclasses return their DestinationType.
    pass
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import apache_beam as beam
import logging

from typing import Dict, Any, List

from uploaders.google_ads.customer_match.abstract_uploader import GoogleAdsCustomerMatchAbstractUploaderDoFn
from uploaders import utils
from models.execution import DestinationType, AccountConfig


class GoogleAdsCustomerMatchContactInfoUploaderDoFn(GoogleAdsCustomerMatchAbstractUploaderDoFn):
  """Customer Match uploader keyed by hashed contact information."""

  def get_list_definition(self, account_config: AccountConfig, destination_metadata: List[str]) -> Dict[str, Any]:
    """Builds the CrmBasedUserList operand for a CONTACT_INFO user list."""
    crm_list_name = destination_metadata[0]
    operand = {
        'xsi_type': 'CrmBasedUserList',
        'name': crm_list_name,
        'description': crm_list_name,
        # CRM-based user lists can use a membershipLifeSpan of 10000 to indicate
        # unlimited; otherwise normal values apply.
        'membershipLifeSpan': 10000,
        'uploadKeyType': 'CONTACT_INFO'
    }
    return {'operand': operand}

  def get_row_keys(self) -> List[str]:
    """Row fields forwarded to the API for contact-info uploads."""
    return ['hashedEmail', 'addressInfo', 'hashedPhoneNumber']

  def get_action_type(self) -> DestinationType:
    """DestinationType handled by this uploader."""
    return DestinationType.ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import apache_beam as beam
import logging

from typing import List, Dict, Any

from uploaders.google_ads.customer_match.abstract_uploader import GoogleAdsCustomerMatchAbstractUploaderDoFn
from uploaders import utils as utils
from models.execution import DestinationType, AccountConfig
from models.oauth_credentials import OAuthCredentials


class GoogleAdsCustomerMatchMobileUploaderDoFn(GoogleAdsCustomerMatchAbstractUploaderDoFn):
  """Customer Match uploader keyed by mobile advertising IDs."""

  def get_list_definition(self, account_config: AccountConfig, destination_metadata: List[str]) -> Dict[str, Any]:
    """Builds the CrmBasedUserList operand for a MOBILE_ADVERTISING_ID list.

    A non-empty fourth metadata entry overrides the account-level app id.
    """
    crm_list_name = destination_metadata[0]

    custom_app_id = destination_metadata[3] if len(destination_metadata) >= 4 else ''
    chosen_app_id = custom_app_id if len(custom_app_id) > 0 else account_config.app_id

    return {
        'operand': {
            'xsi_type': 'CrmBasedUserList',
            'name': crm_list_name,
            'description': crm_list_name,
            # CRM-based user list_name can use a membershipLifeSpan of 10000 to indicate
            # unlimited; otherwise normal values apply.
            'membershipLifeSpan': 10000,
            'appId': chosen_app_id,
            'uploadKeyType': 'MOBILE_ADVERTISING_ID'
        }
    }

  def get_row_keys(self) -> List[str]:
    """Row fields forwarded to the API for mobile-id uploads."""
    return ['mobileId']

  def get_action_type(self) -> DestinationType:
    """DestinationType handled by this uploader."""
    return DestinationType.ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Any, List

from uploaders import utils
from uploaders.google_ads.customer_match.abstract_uploader import GoogleAdsCustomerMatchAbstractUploaderDoFn
from models.execution import DestinationType, AccountConfig


class GoogleAdsCustomerMatchUserIdUploaderDoFn(
    GoogleAdsCustomerMatchAbstractUploaderDoFn):
  """Customer Match uploader keyed by first-party CRM user ids."""

  def get_list_definition(
      self,
      account_config: AccountConfig,
      destination_metadata: List[str]) -> Dict[str, Any]:
    """Builds the CrmBasedUserList operand for a CRM_ID user list."""
    crm_list_name = destination_metadata[0]
    operand = {
        'xsi_type': 'CrmBasedUserList',
        'name': crm_list_name,
        'description': crm_list_name,
        # CRM-based user list_name can use a membershipLifeSpan of 10000 to indicate
        # unlimited; otherwise normal values apply.
        'membershipLifeSpan': 10000,
        'uploadKeyType': 'CRM_ID'
    }
    return {'operand': operand}

  def get_row_keys(self) -> List[str]:
    """Row fields forwarded to the API for CRM-id uploads."""
    return ['userId']

  def get_action_type(self) -> DestinationType:
    """DestinationType handled by this uploader."""
    return DestinationType.ADS_CUSTOMER_MATCH_USER_ID_UPLOAD
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
from typing import Dict, Any
from urllib.parse import quote

import apache_beam as beam
import requests
import json

from uploaders import utils
from models.execution import DestinationType, Batch


class GoogleAnalytics4MeasurementProtocolUploaderDoFn(beam.DoFn):
  """Uploads batch rows to GA4 via the Measurement Protocol.

  Destination metadata layout (positional):
    [0] api_secret
    [1] is_event flag ('true'/'false')
    [2] is_user_property flag
    [3] non_personalized_ads flag
    [4] firebase_app_id (optional, App streams)
    [5] measurement_id (optional, Web streams)
  Exactly one of [4]/[5] and exactly one of [1]/[2] must be set.
  """

  def __init__(self):
    super().__init__()
    # GA4 Measurement Protocol collection endpoint.
    self.API_URL = 'https://www.google-analytics.com/mp/collect'

  def start_bundle(self):
    pass

  @staticmethod
  def _str2bool(s: str) -> bool:
    # Only the exact (case-insensitive) string 'true' is truthy.
    return s.lower() == 'true'

  @staticmethod
  def _exactly_one_of(a: Any, b: Any) -> bool:
    # Logical XOR over truthiness: exactly one of the two values is set.
    return (a and not b) or (not a and b)

  @utils.safe_process(logger=logging.getLogger('megalista.GoogleAnalytics4MeasurementProtocolUploader'))
  def process(self, batch: Batch, **kwargs):
    """Sends one Measurement Protocol request per row in the batch.

    Yields:
      A Batch containing only the rows the API accepted (HTTP 204).

    Raises:
      ValueError: when destination metadata or a row does not identify
        exactly one stream type (app vs. web) or payload type
        (event vs. user property).
    """
    execution = batch.execution

    api_secret = execution.destination.destination_metadata[0]
    is_event = self._str2bool(execution.destination.destination_metadata[1])
    is_user_property = self._str2bool(execution.destination.destination_metadata[2])
    non_personalized_ads = self._str2bool(execution.destination.destination_metadata[3])

    firebase_app_id = None
    if len(execution.destination.destination_metadata) >= 5:
      firebase_app_id = execution.destination.destination_metadata[4]

    measurement_id = None
    if len(execution.destination.destination_metadata) >= 6:
      measurement_id = execution.destination.destination_metadata[5]

    if not self._exactly_one_of(firebase_app_id, measurement_id):
      raise ValueError(
        'GA4 MP should be called either with a firebase_app_id (for apps) or a measurement_id (for web)')

    if not self._exactly_one_of(is_event, is_user_property):
      raise ValueError(
        'GA4 MP should be called either for sending events or a user properties')

    accepted_elements = []

    for row in batch.elements:
      app_instance_id = row.get('app_instance_id')
      client_id = row.get('client_id')
      user_id = row.get('user_id')

      if not self._exactly_one_of(app_instance_id, client_id):
        raise ValueError(
          'GA4 MP should be called either with an app_instance_id (for apps) or a client_id (for web)')

      # Bug fix: build a fresh payload for every row. The previous code
      # reused one dict across iterations, so keys set for an earlier row
      # (e.g. 'user_id') leaked into the requests of later rows.
      payload: Dict[str, Any] = {
        'nonPersonalizedAds': non_personalized_ads
      }

      if is_event:
        # Everything except identifiers and the event name becomes a param.
        params = {k: v for k, v in row.items() if k not in ('name', 'app_instance_id', 'client_id', 'uuid', 'user_id')}
        payload['events'] = [{'name': row['name'], 'params': params}]

      if is_user_property:
        payload['userProperties'] = {k: {'value': v} for k, v in row.items() if k not in ('app_instance_id', 'client_id', 'uuid', 'user_id')}
        # Bug fix: the Measurement Protocol defines 'events' as an array of
        # event objects; a bare dict was previously sent here.
        payload['events'] = [{'name': 'user_property_addition_event', 'params': {}}]

      url_container = [f'{self.API_URL}?api_secret={api_secret}']

      if firebase_app_id:
        url_container.append(f'&firebase_app_id={firebase_app_id}')
        if not app_instance_id:
          raise ValueError(
            'GA4 MP needs an app_instance_id parameter when used for an App Stream.')
        payload['app_instance_id'] = app_instance_id

      if measurement_id:
        url_container.append(f'&measurement_id={measurement_id}')
        if not client_id:
          raise ValueError(
            'GA4 MP needs a client_id parameter when used for a Web Stream.')
        payload['client_id'] = client_id

      if user_id:
        payload['user_id'] = user_id

      url = ''.join(url_container)
      response = requests.post(url, data=json.dumps(payload))
      if response.status_code != 204:
        # Bug fix: log the response body (.text). '.raw' is the underlying
        # urllib3 object and renders as an unhelpful repr.
        logging.getLogger('megalista.GoogleAnalytics4MeasurementProtocolUploader').error(
          f'Error calling GA4 MP {response.status_code}: {response.text}')
      else:
        accepted_elements.append(row)

    logging.getLogger('megalista.GoogleAnalytics4MeasurementProtocolUploader').info(
      f'Successfully uploaded {len(accepted_elements)}/{len(batch.elements)} events.')
    yield Batch(execution, accepted_elements)
Source('orig1', SourceType.BIG_QUERY, []) 48 | execution = Execution(_account_config, source, destination) 49 | with pytest.raises(ValueError, match='GA4 MP should be called either for sending events'): 50 | next(uploader.process(Batch(execution, []))) 51 | 52 | 53 | def test_exception_no_event_nor_user_property(uploader, caplog): 54 | with requests_mock.Mocker() as m: 55 | m.post(requests_mock.ANY, status_code=204) 56 | destination = Destination( 57 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 58 | 'api_secret', 59 | 'False', 60 | 'False', 61 | '', 62 | 'some_id', 63 | '' 64 | ]) 65 | source = Source('orig1', SourceType.BIG_QUERY, []) 66 | execution = Execution(_account_config, source, destination) 67 | with pytest.raises(ValueError, match='GA4 MP should be called either for sending events'): 68 | next(uploader.process(Batch(execution, []))) 69 | 70 | 71 | def test_exception_app_and_web(uploader, caplog): 72 | with requests_mock.Mocker() as m: 73 | m.post(requests_mock.ANY, status_code=204) 74 | destination = Destination( 75 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 76 | 'api_secret', 77 | 'False', 78 | 'True', 79 | '', 80 | 'some_app_id', 81 | 'some_web_id' 82 | ]) 83 | source = Source('orig1', SourceType.BIG_QUERY, []) 84 | execution = Execution(_account_config, source, destination) 85 | with pytest.raises(ValueError, match='GA4 MP should be called either with a firebase_app_id'): 86 | next(uploader.process(Batch(execution, [{ 87 | 'name': 'event_name', 88 | }]))) 89 | 90 | 91 | def test_exception_no_id(uploader, caplog): 92 | with requests_mock.Mocker() as m: 93 | m.post(requests_mock.ANY, status_code=204) 94 | destination = Destination( 95 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 96 | 'api_secret', 97 | 'False', 98 | 'True', 99 | '', 100 | '', 101 | '' 102 | ]) 103 | source = Source('orig1', SourceType.BIG_QUERY, []) 104 | execution = Execution(_account_config, source, destination) 105 | with pytest.raises(ValueError, 
match='GA4 MP should be called either with a firebase_app_id'): 106 | next(uploader.process(Batch(execution, [{ 107 | 'name': 'event_name', 108 | 'value': '123' 109 | }]))) 110 | 111 | def test_exception_app_event_without_app_instance_id(uploader, caplog): 112 | with requests_mock.Mocker() as m: 113 | m.post(requests_mock.ANY, status_code=204) 114 | destination = Destination( 115 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 116 | 'api_secret', 117 | 'True', 118 | 'False', 119 | '', 120 | 'some_id', 121 | '' 122 | ]) 123 | source = Source('orig1', SourceType.BIG_QUERY, []) 124 | execution = Execution(_account_config, source, destination) 125 | with pytest.raises(ValueError, match='GA4 MP needs an app_instance_id parameter when used for an App Stream.'): 126 | next(uploader.process(Batch(execution, [{ 127 | 'client_id': '123', 128 | 'name': 'event_name', 129 | 'value': '42', 130 | 'important_event': 'False' 131 | }]))) 132 | 133 | def test_exception_web_event_without_client_id(uploader, caplog): 134 | with requests_mock.Mocker() as m: 135 | m.post(requests_mock.ANY, status_code=204) 136 | destination = Destination( 137 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 138 | 'api_secret', 139 | 'True', 140 | 'False', 141 | '', 142 | '', 143 | 'some_id' 144 | ]) 145 | source = Source('orig1', SourceType.BIG_QUERY, []) 146 | execution = Execution(_account_config, source, destination) 147 | with pytest.raises(ValueError, match='GA4 MP needs a client_id parameter when used for a Web Stream.'): 148 | next(uploader.process(Batch(execution, [{ 149 | 'app_instance_id': '123', 150 | 'name': 'event_name', 151 | 'value': '42', 152 | 'important_event': 'False' 153 | }]))) 154 | 155 | def test_succesful_app_event_call(uploader, caplog): 156 | with requests_mock.Mocker() as m: 157 | m.post(requests_mock.ANY, status_code=204) 158 | destination = Destination( 159 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 160 | 'api_secret', 161 | 'True', 162 | 'False', 
163 | '', 164 | 'some_id', 165 | '' 166 | ]) 167 | source = Source('orig1', SourceType.BIG_QUERY, []) 168 | execution = Execution(_account_config, source, destination) 169 | next(uploader.process(Batch(execution, [{ 170 | 'app_instance_id': '123', 171 | 'name': 'event_name', 172 | 'value': '42', 173 | 'important_event': 'False' 174 | }]))) 175 | 176 | assert m.call_count == 1 177 | assert m.last_request.json()['events'][0]['params']['value'] == '42' 178 | 179 | 180 | def test_succesful_app_event_call_with_user_id(uploader, caplog): 181 | with requests_mock.Mocker() as m: 182 | m.post(requests_mock.ANY, status_code=204) 183 | destination = Destination( 184 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 185 | 'api_secret', 186 | 'True', 187 | 'False', 188 | '', 189 | 'some_id', 190 | '' 191 | ]) 192 | source = Source('orig1', SourceType.BIG_QUERY, []) 193 | execution = Execution(_account_config, source, destination) 194 | next(uploader.process(Batch(execution, [{ 195 | 'app_instance_id': '123', 196 | 'name': 'event_name', 197 | 'value': '42', 198 | 'user_id': 'Id42' 199 | }]))) 200 | 201 | assert m.call_count == 1 202 | assert m.last_request.json()['user_id'] == 'Id42' 203 | 204 | 205 | def test_succesful_web_user_property_call(uploader, caplog): 206 | with requests_mock.Mocker() as m: 207 | m.post(requests_mock.ANY, status_code=204) 208 | destination = Destination( 209 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 210 | 'api_secret', 211 | 'False', 212 | 'True', 213 | '', 214 | '', 215 | 'some_id' 216 | ]) 217 | source = Source('orig1', SourceType.BIG_QUERY, []) 218 | execution = Execution(_account_config, source, destination) 219 | next(uploader.process(Batch(execution, [{ 220 | 'user_ltv': '42', 221 | 'client_id': 'some_id' 222 | }, 223 | { 224 | 'user_will_churn': 'Maybe', 225 | 'client_id': 'some_id' 226 | } 227 | ]))) 228 | 229 | assert m.call_count == 2 230 | assert m.last_request.json( 231 | )['userProperties']['user_will_churn']['value'] 
== 'Maybe' 232 | 233 | def test_succesful_web_user_property_call_with_user_id(uploader, caplog): 234 | with requests_mock.Mocker() as m: 235 | m.post(requests_mock.ANY, status_code=204) 236 | destination = Destination( 237 | 'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, [ 238 | 'api_secret', 239 | 'False', 240 | 'True', 241 | '', 242 | '', 243 | 'some_id' 244 | ]) 245 | source = Source('orig1', SourceType.BIG_QUERY, []) 246 | execution = Execution(_account_config, source, destination) 247 | next(uploader.process(Batch(execution, [{ 248 | 'user_ltv': '42', 249 | 'user_id': 'Id42', 250 | 'client_id': 'someId' 251 | } 252 | ]))) 253 | 254 | assert m.call_count == 1 255 | assert m.last_request.json( 256 | )['user_id'] == 'Id42' 257 | 258 | -------------------------------------------------------------------------------- /megalist_dataflow/uploaders/google_analytics/google_analytics_data_import_eraser.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import logging

import apache_beam as beam
from google.cloud import bigquery
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build

from models.execution import DestinationType, Batch

# Fixed: this module previously logged under
# 'megalista.GoogleAnalyticsDataImportUploader' (copy-pasted from the
# uploader module), which made eraser errors impossible to attribute.
_LOGGER_NAME = 'megalista.GoogleAnalyticsDataImportEraser'


class GoogleAnalyticsDataImportEraser(beam.DoFn):
    """
    Clean up every file in a Custom Data Import.

    If you are changing this code, be very careful, since this class deletes
    ALL FILES within a Data Import. Make sure you're not deleting files from
    the wrong Data Import. Also, make sure that all unit tests pass and write
    new ones as you feel appropriate.
    """

    def __init__(self, oauth_credentials):
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_analytics_service(self):
        # Builds a Google Analytics Management API v3 client authorized with
        # the stored OAuth refresh credentials.
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=["https://www.googleapis.com/auth/analytics.edit",
                    'https://www.googleapis.com/auth/adwords'])

        return build('analytics', 'v3', credentials=credentials)

    def start_bundle(self):
        pass

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Validates that destination_metadata holds a non-empty
        [web_property_id, data_import_name] pair; raises ValueError otherwise."""
        destination = any_execution.destination.destination_metadata
        if len(destination) < 2:
            raise ValueError('Missing destination information. Found {}'.format(len(destination)))

        if not destination[0] or not destination[1]:
            raise ValueError('Missing destination information. Received {}'.format(str(destination)))

    def process(self, batch: Batch, **kwargs):
        """Deletes every uploaded file of the Data Import named in the
        execution's destination metadata, yielding the batch on success.

        Note: intentionally NOT wrapped in utils.safe_process — erasing must
        run even when the batch carries no elements.
        """
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        ga_account_id = execution.account_config.google_analytics_account_id

        # Reads all metadata parameters
        metadata = execution.destination.destination_metadata

        web_property_id = metadata[0]
        data_import_name = metadata[1]

        analytics = self._get_analytics_service()
        data_sources = analytics.management().customDataSources().list(
            accountId=ga_account_id, webPropertyId=web_property_id).execute()['items']
        data_source_results = list(
            filter(lambda data_source: data_source['name'] == data_import_name, data_sources))

        if len(data_source_results) == 1:
            data_source_id = data_source_results[0]['id']
            try:
                self._call_delete_api(analytics, data_import_name, ga_account_id, data_source_id, web_property_id)
                yield batch
            except Exception as e:
                logging.getLogger(_LOGGER_NAME).error(
                    'Error while deleting GA Data Import files: %s' % e)
        else:
            # Message text is asserted verbatim by the unit tests — do not change.
            logging.getLogger(_LOGGER_NAME).error(
                "%s - data import not found, please configure it in Google Analytics" % data_import_name)

    @staticmethod
    def _is_table_empty(execution):
        """Returns True when the execution's source BigQuery table has no rows.

        NOTE(review): not referenced by process() in this module — callers
        appear to be elsewhere (tests mock it); confirm before removing.
        Table name comes from trusted pipeline config, not user input, so the
        string-built query is acceptable here.
        """
        table_name = execution.source.source_metadata[0] + '.' + execution.source.source_metadata[1]
        client = bigquery.Client()
        query = "select count(*) from " + table_name + " data"
        logging.getLogger().info('Counting rows from table %s for Execution (%s)', table_name, str(execution))

        # Get count value from BigQuery response
        return list(client.query(query).result())[0][0] == 0

    @staticmethod
    def _call_delete_api(analytics, data_import_name, ga_account_id, data_source_id, web_property_id):
        """Lists all uploads of the data source and deletes them in one call."""
        logging.getLogger(_LOGGER_NAME).info(
            "Listing files from %s - %s" % (data_import_name, data_source_id))

        uploads = analytics.management().uploads().list(
            accountId=ga_account_id,
            webPropertyId=web_property_id,
            customDataSourceId=data_source_id
        ).execute()

        file_ids = [upload.get('id') for upload in uploads.get('items', [])]
        if len(file_ids) == 0:
            logging.getLogger(_LOGGER_NAME).error(
                "Data Source %s had no files to delete" % data_import_name)

        else:
            logging.getLogger(_LOGGER_NAME).info(
                "File Ids: %s" % file_ids)

            logging.getLogger(_LOGGER_NAME).info(
                "Deleting %s files from %s - %s" % (len(file_ids), data_import_name, data_source_id))
            analytics.management().uploads().deleteUploadData(
                accountId=ga_account_id,
                webPropertyId=web_property_id,
                customDataSourceId=data_source_id,
                body={
                    'customDataImportUids': file_ids
                }
            ).execute()
import pytest
from apache_beam.options.value_provider import StaticValueProvider
from models.oauth_credentials import OAuthCredentials
from models.execution import Execution, SourceType, DestinationType, Source, AccountConfig, Destination, Batch
from uploaders.google_analytics.google_analytics_data_import_eraser import GoogleAnalyticsDataImportEraser


@pytest.fixture
def eraser(mocker):
    """Builds an eraser with dummy OAuth credentials; googleads clients are
    patched so no network auth happens."""
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    client_id = StaticValueProvider(str, "id")
    secret = StaticValueProvider(str, "secret")
    access = StaticValueProvider(str, "access")
    refresh = StaticValueProvider(str, "refresh")
    credentials = OAuthCredentials(client_id, secret, access, refresh)
    return GoogleAnalyticsDataImportEraser(credentials)


def test_analytics_has_not_data_sources(mocker, eraser, caplog):
    """No custom data sources at all -> 'not found' error is logged."""
    service = mocker.MagicMock()

    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = service

    # NOTE(review): process() does not appear to call _is_table_empty; this
    # mock looks defensive — confirm before removing.
    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    service.management().customDataSources().list().execute.return_value = {
        'items': []
    }

    execution = Execution(AccountConfig('', False, '', '', ''),
                          Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
                          Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))
    # Act
    try:
        next(eraser.process(Batch(execution, [])))
    except StopIteration:
        pass

    assert 'data_import_name - data import not found, please configure it in Google Analytics' in caplog.text


def test_data_source_not_found(mocker, eraser, caplog):
    """Data sources exist but none matches the configured name."""
    service = mocker.MagicMock()

    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'wrong_name'}]
    }

    execution = Execution(AccountConfig('', False, '', '', ''),
                          Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
                          Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))
    # Act
    try:
        next(eraser.process(Batch(execution, [])))
    except StopIteration:
        pass

    assert 'data_import_name - data import not found, please configure it in Google Analytics' in caplog.text


def test_no_files_found(mocker, eraser):
    """Matching data source with zero uploads -> delete API never called."""
    service = mocker.MagicMock()

    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'},
                  {'id': 2, 'name': 'data_import_name2'}]
    }

    execution = Execution(AccountConfig('', False, '', '', ''),
                          Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
                          Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))

    # Add mock to side effect of list uploads
    service.management().uploads().list().execute.return_value = {'items': []}

    # Add mock to side effect of deleteUploadData
    delete_call_mock = mocker.MagicMock()
    service.management().uploads().deleteUploadData.side_effect = delete_call_mock

    # Act
    next(eraser.process(Batch(execution, [])))

    delete_call_mock.assert_not_called()


def test_files_deleted(mocker, eraser):
    """Matching data source with two uploads -> both ids sent to delete API."""
    service = mocker.MagicMock()

    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'},
                  {'id': 2, 'name': 'data_import_name2'}]
    }

    execution = Execution(AccountConfig('', False, '', '', ''),
                          Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
                          Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))

    # Add mock to side effect of list uploads
    service.management().uploads().list().execute.return_value = {'items': [{'id': 'ab'}, {'id': 'cd'}]}

    # Add mock to side effect of deleteUploadData
    delete_call_mock = mocker.MagicMock()
    service.management().uploads().deleteUploadData.side_effect = delete_call_mock

    # Act
    next(eraser.process(Batch(execution, [])))

    # Called once
    delete_call_mock.assert_called_once()

    # Intercept args called
    _, kwargs = delete_call_mock.call_args

    # The delete payload must reference exactly the two listed file ids.
    # (Previously this test captured kwargs['body'] but never asserted on it.)
    ids = kwargs['body']
    assert ids == {'customDataImportUids': ['ab', 'cd']}
import logging
from typing import List, Dict, Union

import apache_beam as beam
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaInMemoryUpload

from uploaders import utils
# Fixed: Union was previously (accidentally) imported from models.execution,
# which merely re-exported typing.Union.
from models.execution import DestinationType, Batch


class GoogleAnalyticsDataImportUploaderDoFn(beam.DoFn):
    """
    This uploader uploads csv files to Google Analytics Data Import.
    The csv headers are the dict received keys.
    Only one Execution can be handled at a time, meaning that only one data
    import can be handled at a time.
    """

    def __init__(self, oauth_credentials):
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_analytics_service(self):
        # Builds a GA Management API v3 client authorized with the stored
        # OAuth refresh credentials.
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=[
                'https://www.googleapis.com/auth/analytics.edit',
                'https://www.googleapis.com/auth/adwords'
            ])

        service = build('analytics', 'v3', credentials=credentials)
        return service

    def start_bundle(self):
        pass

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Validates that destination_metadata holds a non-empty
        [web_property_id, data_import_name] pair; raises ValueError otherwise."""
        destination = any_execution.destination.destination_metadata
        if len(destination) < 2:
            raise ValueError('Missing destination information. Found '
                             f'{len(destination)}')

        if not destination[0] or not destination[1]:
            raise ValueError('Missing destination information. Received {}'.format(
                str(destination)))

    @utils.safe_process(
        logger=logging.getLogger('megalista.GoogleAnalyticsDataImportUploader'))
    def process(self, batch: Batch, **kwargs):
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        ga_account_id = execution.account_config.google_analytics_account_id

        # Reads all metadata parameters
        metadata = execution.destination.destination_metadata

        web_property_id = metadata[0]
        data_import_name = metadata[1]

        self._do_upload_data(web_property_id, data_import_name,
                             ga_account_id, batch.elements)

    def _do_upload_data(self, web_property_id, data_import_name, ga_account_id,
                        rows: List[Dict[str, Union[str, Dict[str, str]]]]):
        """Finds the data source named data_import_name and uploads rows to it
        as a single CSV file."""
        if not rows:
            # Guard: prepare_csv indexes rows[0] and would raise IndexError on
            # an empty batch, so skip the API round-trip entirely.
            logging.getLogger('megalista.GoogleAnalyticsDataImportUploader').warning(
                'No rows to upload to %s, skipping' % data_import_name)
            return

        analytics = self._get_analytics_service()
        data_sources = analytics.management().customDataSources().list(
            accountId=ga_account_id,
            webPropertyId=web_property_id).execute()['items']

        data_source_results = list(
            filter(lambda x: x['name'] == data_import_name, data_sources))

        if data_source_results:

            data_source_id = data_source_results[0]['id']

            try:
                self._call_upload_api(analytics, data_import_name, ga_account_id,
                                      data_source_id, rows, web_property_id)
            except Exception as e:
                logging.getLogger('megalista.GoogleAnalyticsDataImportUploader').error(
                    'Error while uploading GA Data: %s', e)
                raise
        else:
            logging.getLogger('megalista.GoogleAnalyticsDataImportUploader').error(
                '%s - data import not found, please configure it in Google Analytics'
                % data_import_name)

    @staticmethod
    def prepare_csv(rows):
        """
        Transform a input into this format:
        sample = [{'col1': 'val1a', 'col2': 'val2a', 'col3': 'val3a'},
                  {'col1': 'val1b', 'col2': 'val2b', 'col3': 'val3b'},
                  {'col1': 'val1c', 'col2': 'val2c', 'col3': 'val3c'}]
        into a csv:
        col1,col2,col3
        val1a,val2a,val3a
        val1b,val2b,val3b
        val1c,val2c,val3c

        Column names come from the first row's keys, each prefixed 'ga:'.
        None values become empty fields. Requires a non-empty rows list.
        """
        column_names = ['ga:' + columnName for columnName in rows[0].keys()]
        header = ','.join(column_names)
        body = '\n'.join([
            ','.join(
                ['' if element is None else element
                 for element in row.values()])
            for row in rows
        ])
        return '\n'.join([header, body])

    def _call_upload_api(self, analytics, data_import_name, ga_account_id,
                         data_source_id, rows, web_property_id):
        """Uploads the rows as an in-memory CSV file via uploads().uploadData()."""
        logging.getLogger('megalista.GoogleAnalyticsDataImportUploader').info(
            'Adding data to %s - %s' % (data_import_name, data_source_id))
        csv = self.prepare_csv(rows)

        media = MediaInMemoryUpload(
            bytes(csv, 'UTF-8'),
            mimetype='application/octet-stream',
            resumable=True)

        analytics.management().uploads().uploadData(
            accountId=ga_account_id,
            webPropertyId=web_property_id,
            customDataSourceId=data_source_id,
            media_body=media).execute()
import pytest
from apache_beam.options.value_provider import StaticValueProvider

from models.oauth_credentials import OAuthCredentials
from uploaders.google_analytics.google_analytics_data_import_uploader import GoogleAnalyticsDataImportUploaderDoFn
from models.execution import AccountConfig
from models.execution import Destination
from models.execution import DestinationType
from models.execution import Execution
from models.execution import Source
from models.execution import SourceType
from models.execution import Batch

_account_config = AccountConfig('1234567890', False, '1234567890', '', '')


@pytest.fixture
def uploader(mocker):
    # Patch out the googleads clients so credential construction never
    # touches the network, then build the DoFn with dummy value providers.
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    credentials = OAuthCredentials(
        StaticValueProvider(str, 'id'),
        StaticValueProvider(str, 'secret'),
        StaticValueProvider(str, 'access'),
        StaticValueProvider(str, 'refresh'))
    return GoogleAnalyticsDataImportUploaderDoFn(credentials)


def test_get_service(uploader):
    # The analytics service should be constructible from the credentials.
    assert uploader._get_analytics_service() is not None


def test_elements_uploading(mocker, uploader):
    service = mocker.MagicMock()

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    # One data source whose name matches the destination metadata.
    service.management().customDataSources().list().execute.return_value = {
        'items': [{
            'id': 1,
            'name': 'data_import_name'
        }]
    }

    execution = Execution(
        _account_config,
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_DATA_IMPORT,
                    ['web_property', 'data_import_name']))

    # Intercept uploadData() calls.
    upload_data_mock = mocker.MagicMock()
    service.management().uploads().uploadData.side_effect = upload_data_mock

    rows = [
        {'user_id': '12', 'cd1': 'value1a', 'cd2': 'value2a'},
        {'user_id': '34', 'cd1': 'value1b', 'cd2': 'value2b'},
        {'user_id': '56', 'cd1': None, 'cd2': ''},
    ]

    # Act
    uploader.process(Batch(execution, rows))

    # Exactly one upload call expected.
    upload_data_mock.assert_called_once()

    # Inspect the media body that was sent.
    _, kwargs = upload_data_mock.call_args
    media_bytes = kwargs['media_body'].getbytes(0, -1)

    print(media_bytes)
    # Headers get the 'ga:' prefix; None serializes as an empty field.
    assert media_bytes == b'ga:user_id,ga:cd1,ga:cd2\n' \
                          b'12,value1a,value2a\n' \
                          b'34,value1b,value2b\n' \
                          b'56,,'
import logging
from typing import Dict, Any
from urllib.parse import quote

import apache_beam as beam
import requests
import re

from uploaders import utils
from models.execution import DestinationType, Batch


class GoogleAnalyticsMeasurementProtocolUploaderDoFn(beam.DoFn):
    """Uploads rows as 'event' hits to the Universal Analytics Measurement
    Protocol batch endpoint."""

    def __init__(self):
        super().__init__()
        self.API_URL = "https://www.google-analytics.com/batch"
        # Fixed desktop UA string sent with every hit.
        self.UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

    def start_bundle(self):
        pass

    def _format_hit(self, payload: Dict[str, Any]) -> str:
        """URL-encodes a payload dict as 'key=value&...' pairs, dropping
        None-valued entries."""
        return "&".join([key + "=" + quote(str(value)) for key, value in payload.items() if value is not None])

    @utils.safe_process(logger=logging.getLogger("megalista.GoogleAnalyticsMeasurementProtocolUploader"))
    def process(self, batch: Batch, **kwargs):
        execution = batch.execution
        rows = batch.elements
        payloads = [{
            "v": 1,
            "tid": execution.destination.destination_metadata[0],
            "ni": execution.destination.destination_metadata[1],
            "t": "event",
            "ds": "mp - megalista",
            # Any column whose name starts with client_id/user_id maps to
            # the cid/uid hit parameters.
            **{'cid': row[key] for key in row.keys() if key.startswith("client_id")},
            **{'uid': row[key] for key in row.keys() if key.startswith("user_id")},
            "ea": row['event_action'],
            "ec": row['event_category'],
            "ev": row.get('event_value'),
            "el": row.get('event_label'),
            "ua": self.UA,
            # Custom dimensions/metrics pass through verbatim (cd1, cm2, ...).
            # Fixed: raw string — 'c[dm]\d+' had an invalid escape sequence.
            **{key: row[key] for key in row.keys() if re.match(r'c[dm]\d+', key)}
        } for row in rows]

        encoded = [self._format_hit(payload) for payload in payloads]

        payload = '\n'.join(encoded)
        response = requests.post(url=self.API_URL, data=payload)
        if response.status_code != 200:
            # Fixed: response.text (decoded body) instead of response.raw,
            # whose object repr carried no diagnostic information.
            raise Exception(
                f"Error uploading to Analytics HTTP {response.status_code}: {response.text}")
        else:
            yield batch
import logging

import apache_beam as beam
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaInMemoryUpload

from uploaders import utils
from models.execution import Batch, DestinationType


class GoogleAnalyticsUserListUploaderDoFn(beam.DoFn):
    """Creates (if needed) a GA remarketing audience and uploads user ids to
    a GA Data Import as a two-column CSV."""

    def __init__(self,
                 oauth_credentials):
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_analytics_service(self):
        # Builds a GA Management API v3 client authorized with the stored
        # OAuth refresh credentials.
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=["https://www.googleapis.com/auth/analytics.edit", 'https://www.googleapis.com/auth/adwords'])

        service = build('analytics', 'v3', credentials=credentials)
        return service

    def _create_list_if_doesnt_exist(self, analytics, web_property_id, view_ids, list_name, list_definition,
                                     ga_account_id, ads_customer_id, mcc):
        """Returns the id of the remarketing audience named list_name,
        creating it (linked to the Ads account) when absent."""
        lists = analytics.management().remarketingAudience().list(
            accountId=ga_account_id, webPropertyId=web_property_id).execute()['items']
        results = list(
            filter(lambda x: x['name'] == list_name, lists))
        if len(results) == 0:
            logging.getLogger().info('%s list does not exist, creating...' % list_name)

            response = analytics.management().remarketingAudience().insert(
                accountId=ga_account_id,
                webPropertyId=web_property_id,
                body={
                    'name': list_name,
                    'linkedViews': view_ids,
                    'linkedAdAccounts': [{
                        # MCC accounts link differently from plain Ads accounts.
                        'type': 'MCC_LINKS' if mcc else 'ADWORDS_LINKS',
                        'linkedAccountId': ads_customer_id
                    }],
                    **list_definition
                }).execute()
            # Renamed from 'id' to avoid shadowing the builtin.
            audience_id = response['id']
            logging.getLogger().info('%s created with id: %s' % (list_name, audience_id))
        else:
            audience_id = results[0]['id']
            logging.getLogger().info('%s found with id: %s' % (list_name, audience_id))
        return audience_id

    def start_bundle(self):
        pass

    def _create_list(self, web_property_id, view_id, user_id_list_name, buyer_custom_dim, ga_account_id,
                     ads_customer_id,
                     mcc):
        """Ensures a SIMPLE audience exists selecting users whose
        buyer_custom_dim equals 'buyer' (365-day membership)."""
        analytics = self._get_analytics_service()
        view_ids = [view_id]
        self._create_list_if_doesnt_exist(analytics, web_property_id, view_ids, user_id_list_name, {
            'audienceType': 'SIMPLE',
            'audienceDefinition': {
                'includeConditions': {
                    'kind': 'analytics#includeConditions',
                    'isSmartList': False,
                    'segment': 'users::condition::%s==buyer' % buyer_custom_dim,
                    'membershipDurationDays': 365
                }
            }
        }, ga_account_id, ads_customer_id, mcc)

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Validates the 6+-entry destination metadata. Index 3 (the list
        name) is deliberately allowed to be blank — see _do_upload_data."""
        destination = any_execution.destination.destination_metadata
        if len(destination) < 6:
            raise ValueError('Missing destination information. Found {}'.format(len(destination)))

        if not destination[0] \
                or not destination[1] \
                or not destination[2] \
                or not destination[4] \
                or not destination[5]:
            raise ValueError('Missing destination information. Received {}'.format(str(destination)))

    @utils.safe_process(logger=logging.getLogger("megalista.GoogleAnalyticsUserListUploader"))
    def process(self, batch: Batch, **kwargs):
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        ads_customer_id = execution.account_config.google_ads_account_id
        mcc = execution.account_config.mcc
        ga_account_id = execution.account_config.google_analytics_account_id

        # Reads all metadata parameters
        metadata = execution.destination.destination_metadata

        web_property_id = metadata[0]
        view_id = metadata[1]
        data_import_name = metadata[2]
        user_id_list_name = metadata[3]
        user_id_custom_dim = metadata[4]
        buyer_custom_dim = metadata[5]

        # Optional parameter
        custom_dim_field = metadata[6] if len(metadata) > 6 else None

        self._do_upload_data(web_property_id, view_id, data_import_name, user_id_list_name, user_id_custom_dim,
                             buyer_custom_dim, custom_dim_field, ga_account_id, ads_customer_id, mcc,
                             batch.elements)

    def _do_upload_data(self, web_property_id, view_id, data_import_name, user_id_list_name, user_id_custom_dim,
                        buyer_custom_dim, custom_dim_field, ga_account_id, ads_customer_id, mcc, rows):

        # A blank list name skips audience creation entirely.
        if user_id_list_name:
            self._create_list(web_property_id, view_id, user_id_list_name, buyer_custom_dim, ga_account_id,
                              ads_customer_id, mcc)

        analytics = self._get_analytics_service()
        data_sources = analytics.management().customDataSources().list(
            accountId=ga_account_id, webPropertyId=web_property_id).execute()['items']
        results = list(
            filter(lambda x: x['name'] == data_import_name, data_sources))

        if len(results) == 1:

            # Renamed from 'id' to avoid shadowing the builtin.
            data_source_id = results[0]['id']

            logging.getLogger().info("Adding data to %s - %s" % (data_import_name, data_source_id))
            # CSV: header row of the two custom-dimension names, then one
            # '<user_id>,<value>' line per row; value falls back to the
            # literal 'buyer' when no custom_dim_field was configured.
            body = '\n'.join([
                '%s,%s' % (user_id_custom_dim, buyer_custom_dim),
                *['%s,%s' % (row['user_id'], row[custom_dim_field] if custom_dim_field else 'buyer') for row in rows]
            ])

            try:
                media = MediaInMemoryUpload(bytes(body, 'UTF-8'),
                                            mimetype='application/octet-stream',
                                            resumable=True)
                analytics.management().uploads().uploadData(
                    accountId=ga_account_id,
                    webPropertyId=web_property_id,
                    customDataSourceId=data_source_id,
                    media_body=media).execute()
            except Exception as e:
                logging.getLogger().error('Error while uploading GA Data: %s' % e)
        else:
            logging.getLogger().error(
                "%s - data import not found, please configure it in Google Analytics" % data_import_name)
import pytest
from apache_beam.options.value_provider import StaticValueProvider

from uploaders.google_analytics.google_analytics_user_list_uploader import GoogleAnalyticsUserListUploaderDoFn
from models.oauth_credentials import OAuthCredentials
from models.execution import Execution, SourceType, DestinationType, Source, AccountConfig, Destination, Batch


@pytest.fixture
def uploader(mocker):
    # Patches the googleads clients so no real auth happens; the DoFn only
    # needs credential value providers.
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    client_id = StaticValueProvider(str, 'id')
    secret = StaticValueProvider(str, 'secret')
    access = StaticValueProvider(str, 'access')
    refresh = StaticValueProvider(str, 'refresh')
    credentials = OAuthCredentials(client_id, secret, access, refresh)
    return GoogleAnalyticsUserListUploaderDoFn(credentials)


def test_get_service(uploader):
    assert uploader._get_analytics_service() is not None


def test_list_already_exists(mocker, uploader):
    # When an audience named 'list' already exists, no insert call is made.
    service = mocker.MagicMock()
    service.management().remarketingAudience().list().execute = mocker.Mock(
        return_value={'items': [{
            'id': 1,
            'name': 'list'
        }]})

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD,
                    ['a', 'b', 'c', 'list', 'd', 'e']))

    uploader.process(Batch(execution, []))

    uploader._get_analytics_service().management().remarketingAudience(
    ).insert.assert_not_called()


def test_list_creation_not_mcc(mocker, uploader):
    # Non-MCC account -> audience linked with type ADWORDS_LINKS.
    ads_account_id = 'xxx-yyy-zzzz'
    ga_account_id = 'acc'

    service = mocker.MagicMock()

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    service.management().remarketingAudience().insert().execute.return_value = {
        'id': 1
    }

    execution = Execution(
        AccountConfig(ads_account_id, False, ga_account_id, '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination(
            'dest1', DestinationType.GA_USER_LIST_UPLOAD,
            ['web_property', 'view', 'c', 'list', 'd', 'buyers_custom_dim']))
    uploader.process(Batch(execution, []))

    service.management().remarketingAudience().insert.assert_any_call(
        accountId=ga_account_id,
        webPropertyId='web_property',
        body={
            'name': 'list',
            'linkedViews': ['view'],
            'linkedAdAccounts': [{
                'type': 'ADWORDS_LINKS',
                'linkedAccountId': ads_account_id
            }],
            'audienceType': 'SIMPLE',
            'audienceDefinition': {
                'includeConditions': {
                    'kind':
                        'analytics#includeConditions',
                    'isSmartList':
                        False,
                    'segment':
                        'users::condition::%s==buyer' % 'buyers_custom_dim',
                    'membershipDurationDays':
                        365
                }
            }
        })


def test_list_creation_mcc(mocker, uploader):
    # MCC account -> audience linked with type MCC_LINKS.
    ads_account_id = 'xxx-yyy-zzzz'
    ga_account_id = 'acc'

    service = mocker.MagicMock()

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    service.management().remarketingAudience().insert().execute.return_value = {
        'id': 1
    }

    execution = Execution(
        AccountConfig(ads_account_id, True, ga_account_id, '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination(
            'dest1', DestinationType.GA_USER_LIST_UPLOAD,
            ['web_property', 'view', 'c', 'list', 'd', 'buyers_custom_dim']))
    uploader.process(Batch(execution, []))

    service.management().remarketingAudience().insert.assert_any_call(
        accountId=ga_account_id,
        webPropertyId='web_property',
        body={
            'name': 'list',
            'linkedViews': ['view'],
            'linkedAdAccounts': [{
                'type': 'MCC_LINKS',
                'linkedAccountId': ads_account_id
            }],
            'audienceType': 'SIMPLE',
            'audienceDefinition': {
                'includeConditions': {
                    'kind':
                        'analytics#includeConditions',
                    'isSmartList':
                        False,
                    'segment':
                        'users::condition::%s==buyer' % 'buyers_custom_dim',
                    'membershipDurationDays':
                        365
                }
            }
        })


def test_avoid_list_creation_when_name_blank(mocker, uploader):
    # A blank list name (metadata index 3) must skip audience creation.
    ads_account_id = 'xxx-yyy-zzzz'
    ga_account_id = 'acc'

    service = mocker.MagicMock()

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    execution = Execution(
        AccountConfig(ads_account_id, True, ga_account_id, '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD,
                    ['web_property', 'view', 'c', '', 'd', 'buyers_custom_dim']))

    uploader.process(Batch(execution, []))

    service.management().remarketingAudience().insert.assert_not_called()


def test_elements_uploading(mocker, uploader):
    # Without a custom_dim_field, every row's value column is 'buyer'.
    service = mocker.MagicMock()

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    service.management().customDataSources().list().execute.return_value = {
        'items': [{
            'id': 1,
            'name': 'data_import_name'
        }]
    }

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD, [
            'web_property', 'b', 'data_import_name', 'd', 'user_id_custom_dim',
            'buyer_custom_dim'
        ]))

    # Add mock to side effect of uploadData()
    my_mock = mocker.MagicMock()
    service.management().uploads().uploadData.side_effect = my_mock

    # Act
    uploader.process(Batch(execution, [{
        'user_id': '12'
    }, {
        'user_id': '34'
    }]))

    # Called once
    my_mock.assert_called_once()

    # Intercept args called
    _, kwargs = my_mock.call_args

    # Check if really sent values from custom field
    media_bytes = kwargs['media_body'].getbytes(0, -1)

    assert media_bytes == b'user_id_custom_dim,buyer_custom_dim\n12,buyer\n34,buyer'


def test_elements_uploading_custom_field(mocker, uploader):
    # With custom_dim_field 'my_field', each row's my_field value is uploaded.
    service = mocker.MagicMock()

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    service.management().customDataSources().list().execute.return_value = {
        'items': [{
            'id': 1,
            'name': 'data_import_name'
        }]
    }

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD, [
            'web_property', 'b', 'data_import_name', 'd', 'user_id_custom_dim',
            'buyer_custom_dim', 'my_field'
        ]))

    # Add mock to side effect of uploadData()
    my_mock = mocker.MagicMock()
    service.management().uploads().uploadData.side_effect = my_mock

    # Act
    uploader.process(Batch(execution, [{
        'user_id': '12',
        'my_field': '11'
    }, {
        'user_id': '34',
        'my_field': '22'
    }]))

    # Called once
    my_mock.assert_called_once()

    # Intercept args called
    _, kwargs = my_mock.call_args

    # Check if really sent values from custom field
261 | media_bytes = kwargs['media_body'].getbytes(0, -1) 262 | 263 | assert media_bytes == b'user_id_custom_dim,buyer_custom_dim\n12,11\n34,22' 264 | -------------------------------------------------------------------------------- /megalist_dataflow/uploaders/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import datetime 16 | from models.execution import DestinationType 17 | from models.execution import Execution 18 | import pytz 19 | 20 | MAX_RETRIES = 3 21 | 22 | timezone = pytz.timezone('America/Sao_Paulo') 23 | 24 | 25 | def get_ads_service(service_name, version, oauth_credentials, developer_token, 26 | customer_id): 27 | from googleads import adwords 28 | from googleads import oauth2 29 | oauth2_client = oauth2.GoogleRefreshTokenClient( 30 | oauth_credentials.get_client_id(), oauth_credentials.get_client_secret(), 31 | oauth_credentials.get_refresh_token()) 32 | client = adwords.AdWordsClient( 33 | developer_token, 34 | oauth2_client, 35 | 'Mds Dataflow', 36 | client_customer_id=customer_id) 37 | client.partial_failure = True 38 | return client.GetService(service_name, version=version) 39 | 40 | 41 | def format_date(date): 42 | if isinstance(date, datetime.datetime): 43 | pdate = date 44 | else: 45 | pdate = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S.%f') 46 | 47 | return 
f'{datetime.datetime.strftime(pdate, "%Y%m%d %H%M%S")} {timezone.zone}' 48 | 49 | 50 | def safe_process(logger): 51 | def deco(func): 52 | def inner(*args, **kwargs): 53 | batch = args[1] 54 | if not batch: 55 | logger.warning('Skipping upload, received no elements.') 56 | return 57 | logger.info(f'Uploading {len(batch.elements)} rows...') 58 | try: 59 | return func(*args, **kwargs) 60 | except Exception as e: 61 | logger.error(f'Error uploading data for :{batch.elements}') 62 | logger.error(e, exc_info=True) 63 | logger.exception('Error uploading data.') 64 | 65 | return inner 66 | 67 | return deco 68 | 69 | 70 | def safe_call_api(function, logger, *args, **kwargs): 71 | current_retry = 1 72 | _do_safe_call_api(function, logger, current_retry, *args, **kwargs) 73 | 74 | 75 | def _do_safe_call_api(function, logger, current_retry, *args, **kwargs): 76 | try: 77 | return function(*args, *kwargs) 78 | except Exception as e: 79 | if current_retry < MAX_RETRIES: 80 | logger.exception( 81 | f'Fail number {current_retry}. Stack track follows. 
Trying again.') 82 | current_retry += 1 83 | return _do_safe_call_api(function, logger, current_retry, *args, **kwargs) 84 | 85 | 86 | def convert_datetime_tz(dt, origin_tz, destination_tz): 87 | datetime_obj = pytz.timezone(origin_tz).localize(dt) 88 | return datetime_obj.astimezone(pytz.timezone(destination_tz)) 89 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | warn_return_any = True 4 | warn_unused_configs = True 5 | ignore_missing_imports = True -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.semantic_release] 2 | upload_to_pypi = false 3 | branch = 'master' 4 | version_variable = [ 5 | 'megalist_dataflow/setup.py:__version__' 6 | ] 7 | version_source = 'tag' 8 | build_command = false 9 | -------------------------------------------------------------------------------- /run_cloud.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | if [ $# != 3 ]; then 18 | echo "Usage: $0 gcp_project_id bucket_name region" 19 | exit 1 20 | fi 21 | 22 | gcloud config set project $1 23 | token=$(gcloud auth application-default print-access-token) 24 | curl -H "Authorization: Bearer $token" -H "Content-Type:application/json" "https://dataflow.googleapis.com/v1b3/projects/$1/locations/$3/templates:launch?gcsPath=gs://$2/templates/mds" --data-binary "@cloud_config/scheduler.json" 25 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | python3 -m mypy megalist_dataflow 17 | python3 -m pytest -vv --cov=megalist_dataflow -W ignore::DeprecationWarning 18 | -------------------------------------------------------------------------------- /terraform/external.tf: -------------------------------------------------------------------------------- 1 | #create detaflow metadata 2 | resource "null_resource" "bucket_megalista_metadata" { 3 | provisioner "local-exec" { 4 | command = "sh ./scripts/deploy_cloud.sh ${data.google_client_config.current.project} ${var.bucket_name} ${var.region}" 5 | } 6 | 7 | depends_on = [google_storage_bucket.my_storage] 8 | } -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | data "google_client_config" "current" { 2 | } 3 | 4 | data "google_client_openid_userinfo" "me" { 5 | } 6 | 7 | resource "google_bigquery_dataset" "dataset" { 8 | dataset_id = var.bq_ops_dataset 9 | location = var.location 10 | description = "Auxliary bigquery dataset for Megalista operations to create" 11 | delete_contents_on_destroy = true 12 | } 13 | 14 | locals { 15 | scheduler_body = <