├── .github
├── CHANGELOG.md
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── PULL_REQUEST_TEMPLATE.md
├── dependabot.yml
└── workflows
│ ├── codacy-analysis.yml
│ ├── python-app.yml
│ ├── semantic-release.yml
│ └── terraform.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── _config.yml
├── cloud_config
├── buyers.json
├── generate_megalist_token.py
└── scheduler_sample.json
├── documentation
└── Megalista - Technical User Guide - EXTERNAL.pdf
├── generate_megalist_token.sh
├── megalist_dataflow
├── main.py
├── mappers
│ ├── __init__.py
│ ├── ads_ssd_hashing_mapper.py
│ ├── ads_user_list_pii_hashing_mapper.py
│ └── ads_user_list_pii_hashing_mapper_test.py
├── megalist_metadata
├── models
│ ├── __init__.py
│ ├── execution.py
│ ├── oauth_credentials.py
│ ├── oauth_credentials_test.py
│ ├── options.py
│ ├── options_test.py
│ └── sheets_config.py
├── requirements.txt
├── setup.py
├── sources
│ ├── __init__.py
│ ├── base_bounded_source.py
│ ├── batches_from_executions.py
│ ├── firestore_execution_source.py
│ └── spreadsheet_execution_source.py
└── uploaders
│ ├── __init__.py
│ ├── appsflyer
│ ├── __init__.py
│ └── appsflyer_s2s_uploader_async.py
│ ├── big_query
│ ├── __init__.py
│ ├── transactional_events_results_writer.py
│ └── transactional_events_results_writer_test.py
│ ├── campaign_manager
│ ├── __init__.py
│ ├── campaign_manager_conversion_uploader.py
│ └── campaign_manager_conversion_uploader_test.py
│ ├── google_ads
│ ├── __init__.py
│ ├── conversions
│ │ ├── __init__.py
│ │ ├── google_ads_offline_conversions_uploader.py
│ │ ├── google_ads_offline_conversions_uploader_test.py
│ │ ├── google_ads_ssd_uploader.py
│ │ └── google_ads_ssd_uploader_test.py
│ └── customer_match
│ │ ├── __init__.py
│ │ ├── abstract_uploader.py
│ │ ├── contact_info_uploader.py
│ │ ├── mobile_uploader.py
│ │ └── user_id_uploader.py
│ ├── google_analytics
│ ├── __init__.py
│ ├── google_analytics_4_measurement_protocol.py
│ ├── google_analytics_4_measurement_protocol_test.py
│ ├── google_analytics_data_import_eraser.py
│ ├── google_analytics_data_import_eraser_test.py
│ ├── google_analytics_data_import_uploader.py
│ ├── google_analytics_data_import_uploader_test.py
│ ├── google_analytics_measurement_protocol.py
│ ├── google_analytics_user_list_uploader.py
│ └── google_analytics_user_list_uploader_test.py
│ └── utils.py
├── mypy.ini
├── pyproject.toml
├── run_cloud.sh
├── run_tests.sh
├── terraform
├── external.tf
├── main.tf
├── scripts
│ └── deploy_cloud.sh
└── variables.tf
└── terraform_deploy.sh
/.github/CHANGELOG.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/.github/CHANGELOG.md
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: '[BUG]'
5 | labels: bug
6 | assignees: ''
7 | ---
8 |
9 | **Describe the bug**
10 | A clear and concise description of what the bug is.
11 |
12 | **To Reproduce**
13 | Steps to reproduce the behavior:
14 |
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: '[NEW]'
5 | labels: enhancement
6 | assignees: ''
7 | ---
8 |
9 | **Is your feature request related to a problem? Please describe.**
10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
11 |
12 | **Describe the solution you'd like**
13 | A clear and concise description of what you want to happen.
14 |
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 |
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | **What issue does this pull request resolve?**
2 |
3 | **What changes did you make?**
4 |
5 | **Is there anything that requires more attention while reviewing?**
6 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | # Maintain dependencies for GitHub Actions
4 | - package-ecosystem: 'github-actions'
5 | directory: '/'
6 | schedule:
7 | interval: 'monthly'
8 |
9 | # Maintain dependencies for pip
10 | - package-ecosystem: 'pip'
11 | directory: '/megalist_dataflow'
12 | schedule:
13 | interval: 'monthly'
14 |
--------------------------------------------------------------------------------
/.github/workflows/codacy-analysis.yml:
--------------------------------------------------------------------------------
1 | # This workflow checks out code, performs a Codacy security scan
2 | # and integrates the results with the
3 | # GitHub Advanced Security code scanning feature. For more information on
4 | # the Codacy security scan action usage and parameters, see
5 | # https://github.com/codacy/codacy-analysis-cli-action.
6 | # For more information on Codacy Analysis CLI in general, see
7 | # https://github.com/codacy/codacy-analysis-cli.
8 |
9 | name: Codacy Analysis
10 |
11 | on: ['push']
12 |
13 | jobs:
14 | codacy-security-scan:
15 | name: Codacy Analysis
16 | runs-on: ubuntu-latest
17 | steps:
18 | # Checkout the repository to the GitHub Actions runner
19 | - name: Checkout code
20 | uses: actions/checkout@v2
21 |
22 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis
23 | - name: Run Codacy Analysis CLI
24 | uses: codacy/codacy-analysis-cli-action@3.0.0
25 | with:
26 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository
27 | # You can also omit the token and run the tools that support default configurations
28 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
29 | verbose: true
30 | output: results.sarif
31 | format: sarif
32 | # Adjust severity of non-security issues
33 | gh-code-scanning-compat: true
34 | # Force 0 exit code to allow SARIF file generation
35 | # This will handover control about PR rejection to the GitHub side
36 | max-allowed-issues: 2147483647
37 |
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python
5 |
6 | on:
7 | push:
8 | branches: [ develop ]
9 | pull_request:
10 | branches: [ main, master ]
11 |
12 | jobs:
13 | unit_testing:
14 | name: Test
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python 3.8
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: 3.8
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install -r megalist_dataflow/requirements.txt
27 | - name: Run tests
28 | run: |
29 | ./run_tests.sh
30 | - name: Upload coverage to Codacy
31 | run: export CODACY_PROJECT_TOKEN=${{ secrets.CODACY_PROJECT_TOKEN }} && bash <(curl -Ls https://coverage.codacy.com/get.sh) report -r megalist_dataflow/*
32 | continue-on-error: true
33 |
--------------------------------------------------------------------------------
/.github/workflows/semantic-release.yml:
--------------------------------------------------------------------------------
1 | name: Semantic Release
2 |
3 | on:
4 | push:
5 | branches: [ main, master ]
6 |
7 | jobs:
8 | release:
9 | name: Release
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - uses: actions/checkout@v2
14 | with:
15 | fetch-depth: 0
16 |
17 | - name: Python Semantic Release
18 | uses: relekang/python-semantic-release@master
19 | with:
20 | github_token: ${{ secrets.GITHUB_TOKEN }}
21 | pypi_token: ${{ secrets.PYPI_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/terraform.yml:
--------------------------------------------------------------------------------
1 | name: Terraform Validate
2 |
3 | on: ['push']
4 |
5 | jobs:
6 | terraform-actions:
7 | name: tf validate
8 | runs-on: ubuntu-latest
9 | defaults:
10 | run:
11 | working-directory: ./terraform
12 | steps:
13 | - name: Checkout Repository
14 | uses: actions/checkout@master
15 |
16 | - name: HashiCorp - Setup Terraform
17 | uses: hashicorp/setup-terraform@v1.3.2
18 | with:
19 | terraform_version: 0.14.6
20 |
21 | - name: Terraform Init
22 | id: init
23 | run: terraform init
24 | continue-on-error: true
25 |
26 | - name: Terraform Fmt
27 | id: fmt
28 | run: terraform fmt -check -diff
29 | continue-on-error: true
30 |
31 | - name: Terraform Validate
32 | id: validate
33 | run: terraform validate -no-color
34 | continue-on-error: false
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__*
2 | *.egg-info
3 | temp
4 | .pytest_cache
5 | .coverage
6 | htmlcov
7 | cloud_config/scheduler.json
8 | run_local.sh
9 | generate_csv.sh
10 | .terraform
11 | .idea
12 | .venv
13 | .vscode
14 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to make participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | - Using welcoming and inclusive language
18 | - Being respectful of differing viewpoints and experiences
19 | - Gracefully accepting constructive criticism
20 | - Focusing on what is best for the community
21 | - Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | - The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | - Trolling, insulting/derogatory comments, and personal or political attacks
28 | - Public or private harassment
29 | - Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | - Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at koopas@dp6.com.br. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows
28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MDS - Marketing Data Sync
2 |
3 | Solution based on the [Google Megalista project](https://github.com/google/megalista).
4 |
5 |
6 |

7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | Sample integration code for onboarding offline/CRM data from BigQuery as custom audiences or offline conversions in Google Ads, Google Analytics 360, Google Display & Video 360, Google Campaign Manager and Facebook Ads.
20 |
21 | ## Supported integrations
22 | - **Google Ads**
23 | - Contact Info **Customer Match** (email, phone, address) [[details]](https://support.google.com/google-ads/answer/6379332?&ref_topic=6296507)
24 | - Id Based **Customer Match** (device Id, user id)
25 | - Offline Conversions through **gclid** [[details]](https://support.google.com/google-ads/answer/2998031?)
26 | - Store Sales Direct **(SSD)** conversions [[details]](https://support.google.com/google-ads/answer/9995886?hl=en)
27 |
28 | - **Google Analytics (Universal analytics)**
29 | - Custom segments through **Data Import** [[details]](https://support.google.com/analytics/answer/3191589?hl=en)
30 | - Measurement Protocol [[details]](https://developers.google.com/analytics/devguides/collection/protocol/v1#:~:text=Measurement%20Protocol%20Overview%20bookmark_border&text=The%20Google%20Analytics%20Measurement%20Protocol,directly%20to%20Google%20Analytics%20servers.)
31 |
32 | - **Campaign Manager**
33 | - Offline Conversions API **(user id, device id, match id, gclid, dclid)** [[details]](https://developers.google.com/doubleclick-advertisers/guides/conversions_upload)
34 |
35 | - **Google Analytics 4**
36 | - Measurement protocol (Web + App) [[details]](https://developers.google.com/analytics/devguides/collection/protocol/ga4)
37 |
38 | - **Appsflyer**
39 | - S2S Offline events API (conversion upload), to be used for audience creation and in-app events with Google Ads and DV360 [[details]](https://support.appsflyer.com/hc/en-us/articles/207034486-API-de-eventos-de-servidor-para-servidor-S2S-mobile-para-mobile)
40 |
41 | ## How does it work
42 | MDS was designed to separate the configuration of conversion/audience upload rules from the engine, giving more freedom for non-technical teams (i.e. Media and Business Intelligence) to set up multiple upload rules on their own.
43 |
44 | The solution consists of #1 a Google Spreadsheet (template) in which all rules are defined by mapping a data source (BigQuery Table) to a destination (data upload endpoint) and #2, an Apache Beam workflow running on Google Dataflow, scheduled to upload the data in batch mode.
45 |
46 | ## Prerequisites
47 |
48 | ### Google Cloud Services
49 | - **Google Cloud Platform** account
50 | - **Billing** enabled
51 | - **BigQuery** enabled
52 | - **Dataflow** enabled
53 | - **Cloud storage** enabled
54 | - **Cloud scheduler** enabled
55 | - At least one of:
56 | - **Google Ads** API Access
57 | - **Campaign Manager** API Access
58 | - **Google Analytics** API Access
59 | - **Python3**
60 | - **Google Cloud SDK**
61 |
62 | ### Access Requirements
63 | Those are the minimum roles necessary to deploy MDS:
64 | - OAuth Config Editor
65 | - BigQuery User
66 | - BigQuery Job User
67 | - BigQuery Data Viewer
68 | - Cloud Scheduler Admin
69 | - Storage Admin
70 | - Dataflow Admin
71 | - Service Account Admin
72 | - Logs Viewer
73 | - Service Consumer
74 |
75 | ### APIs
76 | Required APIs will depend on upload endpoints in use. We recommend you enable all of them:
77 | - Google Sheets (required for any use case) [[link]](https://console.cloud.google.com/apis/library/sheets.googleapis.com)
78 | - Google Analytics [[link]](https://console.cloud.google.com/apis/library/analytics.googleapis.com)
79 | - Google Analytics Reporting [[link]](https://console.cloud.google.com/apis/library/analyticsreporting.googleapis.com)
80 | - Google Ads [[link]](https://console.cloud.google.com/apis/library/googleads.googleapis.com)
81 | - Campaign Manager [[link]](https://console.cloud.google.com/apis/library/dfareporting.googleapis.com)
82 |
83 |
84 | ## Installation
85 |
86 | ### Create a copy of the configuration Spreadsheet
87 | WIP
88 |
89 | ### Creating required access tokens
90 | To access campaigns and user lists on Google's platforms, this dataflow will need OAuth tokens for an account that can authenticate in those systems.
91 |
92 | In order to create it, follow these steps:
93 | - Access GCP console
94 | - Go to the **API & Services** section on the top-left menu.
95 | - Go to the **OAuth Consent Screen** and configure an *Application name*
96 | - Then, go to the **Credentials** and create an *OAuth client Id* with Application type set as *Desktop App*
97 | - This will generate a *Client Id* and a *Client secret*
98 | - Run the **generate_mds_token.sh** script in this folder providing these two values and follow the instructions
99 | - Sample: `./generate_mds_token.sh client_id client_secret`
100 | - This will generate the *Access Token* and the *Refresh token*
101 |
102 | ### Creating a bucket on Cloud Storage
103 | This bucket will hold the deployed code for this solution. To create it, navigate to the *Storage* link on the top-left menu on GCP and click on *Create bucket*. You can use Regional location and Standard data type for this bucket.
104 |
105 | ## Running MDS
106 |
107 | We recommend first running it locally and make sure that everything works.
108 | Make some sample tables on BigQuery for one of the uploaders and make sure that the data is getting correctly to the destination.
109 | After that is done, upload the Dataflow template to GCP and try running it manually via the UI to make sure it works.
110 | Lastly, configure the Cloud Scheduler to run MDS in the frequency desired and you'll have a fully functional data integration pipeline.
111 |
112 | ### Running locally
113 | ```bash
114 | python3 mds_dataflow/main.py \
115 | --runner DirectRunner \
116 | --developer_token ${GOOGLE_ADS_DEVELOPER_TOKEN} \
117 | --setup_sheet_id ${CONFIGURATION_SHEET_ID} \
118 | --refresh_token ${REFRESH_TOKEN} \
119 | --access_token ${ACCESS_TOKEN} \
120 | --client_id ${CLIENT_ID} \
121 | --client_secret ${CLIENT_SECRET} \
122 | --project ${GCP_PROJECT_ID} \
123 | --region us-central1 \
  --temp_location gs://${GCS_BUCKET}/tmp
125 | ```
126 |
127 | ### Deploying Pipeline
128 | To deploy, use the following commands from the root folder:
129 | ```
130 | cd terraform
131 | ./scripts/deploy_cloud.sh project_id bucket_name region_name
132 | ```
133 |
134 | #### Manually executing pipeline using Dataflow UI
135 | To execute the pipeline, use the following steps:
136 | - Go to **Dataflow** on GCP console
137 | - Click on *Create job from template*
138 | - On the template selection dropdown, select *Custom template*
139 | - Find the *mds* file on the bucket you've created, on the templates folder
140 | - Fill in the parameters required and execute
141 |
142 | ### Scheduling pipeline
143 | To schedule daily/hourly runs, go to **Cloud Scheduler**:
144 | - Click on *create job*
145 | - Add a name and frequency as desired
146 | - For *target* set as HTTP
147 | - Configure a *POST* for url: https://dataflow.googleapis.com/v1b3/projects/${YOUR_PROJECT_ID}/locations/${LOCATION}/templates:launch?gcsPath=gs://${BUCKET_NAME}/templates/mds, replacing the params with the actual values
- For a sample on the *body* of the request, check **cloud_config/scheduler_sample.json**
149 | - Add OAuth Headers
150 | - Scope: https://www.googleapis.com/auth/cloud-platform
151 |
152 | #### Creating a Service Account
153 | It's recommended to create a new Service Account to be used with the Cloud Scheduler
154 | - Go to IAM & Admin > Service Accounts
155 | - Create a new Service Account with the following roles:
156 | - Cloud Dataflow Service Agent
157 | - Dataflow Admin
158 | - Storage Objects Viewer
159 |
160 |
161 | ## Usage
162 | Every upload method expects as source a BigQuery data with specific fields, in addition to specific configuration metadata. For details on how to setup your upload routines, refer to the [MDS Wiki](https://github.com/dp6/marketing-data-sync/wiki) or the [MDS user guide](https://github.com/dp6/marketing-data-sync/blob/main/documentation/mds%20-%20Technical%20User%20Guide%20-%20EXTERNAL.pdf).
163 |
164 | ### Mandatory requirements
165 |
166 | Only contributions that meet the following requirements will be accepted:
167 |
168 | - [Commit pattern](https://www.conventionalcommits.org/en/v1.0.0/)
169 |
170 | ## Support:
171 |
172 | **DP6 Koopa-troopa Team**
173 |
174 | _e-mail: _
175 |
176 |
177 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | title: DP6 - Centro de inovações
2 | initiative: 'Marketing Data Sync'
3 |
4 | remote_theme: dp6/dp6.github.io
5 |
6 | plugins:
7 | - jekyll-sitemap
8 | - jekyll-gzip
9 |
--------------------------------------------------------------------------------
/cloud_config/buyers.json:
--------------------------------------------------------------------------------
1 | {"user_id": "uuid1", "gclid":"E45C235","mobile_device_id":"A76B923847E","email":"test@test.com","mailing_address":{"first_name":"John","last_name":"Doe","country":"US","zip":"111-2222"},"phone":"555-1234","conversions":[{"id":"1","value":3.75,"time":"2019-06-21 01:11:21.805627 UTC"},{"id":"2","value":5.99,"time":"2019-06-21 01:11:21.805627 UTC"}]}
2 | {"user_id": "uuid2","gclid":"AB9203","mobile_device_id":"35883792E","email":"test2@test2.com","mailing_address":{"first_name":"Jane","last_name":"Doe","country":"US","zip":"111-2222"},"phone":"555-4321","conversions":[{"id":"1","value":53,"time":"2019-06-21 01:10:48.460715 UTC"},{"id":"2","value":12.99,"time":"2019-06-21 01:10:48.460715 UTC"}]}
--------------------------------------------------------------------------------
/cloud_config/generate_megalist_token.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2014 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """Generates refresh token for AdWords using the Installed Application flow."""
18 |
19 |
20 | import argparse
21 | import sys
22 |
23 | from google_auth_oauthlib.flow import InstalledAppFlow
24 | from oauthlib.oauth2.rfc6749.errors import InvalidGrantError
25 |
# Your OAuth2 Client ID and Secret. If you do not have an ID and Secret yet,
# please go to https://console.developers.google.com and create a set.
DEFAULT_CLIENT_ID = None
DEFAULT_CLIENT_SECRET = None

# OAuth2 scopes requested for the generated token: Google Ads (AdWords),
# Campaign Manager (reporting, trafficking, conversions), Google Analytics
# edit access, and read-only access to the configuration spreadsheet.
SCOPES = ['https://www.googleapis.com/auth/adwords',
          'https://www.googleapis.com/auth/dfareporting',
          'https://www.googleapis.com/auth/dfatrafficking',
          'https://www.googleapis.com/auth/ddmconversions',
          "https://www.googleapis.com/auth/analytics.edit",
          'https://www.googleapis.com/auth/spreadsheets.readonly']

# The redirect URI set for the given Client ID. The redirect URI for Client ID
# generated for an installed application will always have this value.
_REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'

# Command-line interface; client id/secret fall back to the constants above.
parser = argparse.ArgumentParser(description='Generates a refresh token with '
                                 'the provided credentials.')
parser.add_argument('--client_id', default=DEFAULT_CLIENT_ID,
                    help='Client Id retrieved from the Developer\'s Console.')
parser.add_argument('--client_secret', default=DEFAULT_CLIENT_SECRET,
                    help='Client Secret retrieved from the Developer\'s '
                    'Console.')
parser.add_argument('--additional_scopes', default=None,
                    help='Additional scopes to apply when generating the '
                    'refresh token. Each scope should be separated by a comma.')
53 |
54 |
class ClientConfigBuilder(object):
    """Builds the client-config dictionary consumed by the OAuth 2.0 flow.

    The resulting dict mirrors the JSON downloaded from the Google API
    Console and is keyed by the client type ('web' or 'installed').
    """
    _DEFAULT_AUTH_URI = 'https://accounts.google.com/o/oauth2/auth'
    _DEFAULT_TOKEN_URI = 'https://accounts.google.com/o/oauth2/token'
    CLIENT_TYPE_WEB = 'web'
    CLIENT_TYPE_INSTALLED_APP = 'installed'

    def __init__(self, client_type=None, client_id=None, client_secret=None,
                 auth_uri=_DEFAULT_AUTH_URI, token_uri=_DEFAULT_TOKEN_URI):
        self.client_type = client_type
        self.client_id = client_id
        self.client_secret = client_secret
        self.auth_uri = auth_uri
        self.token_uri = token_uri

    def Build(self):
        """Returns the client config dict used in the OAuth 2.0 flow.

        Raises:
            ValueError: if any of the five required fields is missing/falsy.
        """
        required = (self.client_type, self.client_id, self.client_secret,
                    self.auth_uri, self.token_uri)
        if not all(required):
            raise ValueError('Required field is missing.')

        return {
            self.client_type: {
                'client_id': self.client_id,
                'client_secret': self.client_secret,
                'auth_uri': self.auth_uri,
                'token_uri': self.token_uri
            }
        }
87 |
88 |
def main(client_id, client_secret, scopes):
    """Runs the installed-app OAuth flow and prints the resulting tokens.

    Args:
        client_id: OAuth2 client ID from the Google API Console.
        client_secret: OAuth2 client secret paired with client_id.
        scopes: list of OAuth scope URLs to request.
    """
    config = ClientConfigBuilder(
        client_type=ClientConfigBuilder.CLIENT_TYPE_WEB,
        client_id=client_id,
        client_secret=client_secret).Build()

    flow = InstalledAppFlow.from_client_config(config, scopes=scopes)
    # from_client_config ignores any redirect_uris embedded in the config,
    # so the out-of-band redirect URI must be assigned explicitly.
    flow.redirect_uri = _REDIRECT_URI

    auth_url, _ = flow.authorization_url(prompt='consent')

    print('Log into the Google Account you use to access your AdWords account '
          'and go to the following URL: \n%s\n' % auth_url)
    print('After approving the token enter the verification code (if specified).')
    code = input('Code: ').strip()

    try:
        flow.fetch_token(code=code)
    except InvalidGrantError as ex:
        print('Authentication has failed: %s' % ex)
        sys.exit(1)

    print('Access token: %s' % flow.credentials.token)
    print('Refresh token: %s' % flow.credentials.refresh_token)
117 |
118 |
if __name__ == '__main__':
    args = parser.parse_args()
    # Copy SCOPES so that --additional_scopes does not mutate the
    # module-level constant (the original aliased the list and then
    # extended it in place).
    configured_scopes = list(SCOPES)
    if not (any([args.client_id, DEFAULT_CLIENT_ID]) and
            any([args.client_secret, DEFAULT_CLIENT_SECRET])):
        raise AttributeError('No client_id or client_secret specified.')
    if args.additional_scopes:
        # Accept a comma-separated list, tolerating stray spaces.
        configured_scopes.extend(
            args.additional_scopes.replace(' ', '').split(','))
    main(args.client_id, args.client_secret, configured_scopes)
129 |
--------------------------------------------------------------------------------
/cloud_config/scheduler_sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "jobName": "mds_daily",
3 | "parameters": {
4 | "developer_token": "Google Ads Developer Token",
5 | "client_id": "GCP OAuth Client id",
6 | "client_secret": "GCP OAuth Client Secret",
7 | "access_token": "GCP OAuth access token",
8 | "refresh_token": "GCP OAuth refresh token",
9 | "setup_sheet_id": "Setup Google Sheets Id",
    "bq_ops_dataset": "Auxiliary bigquery dataset used for MDS operations",
11 | "appsflyer_dev_key": "Apps flyer dev key"
12 | },
13 | "environment": {
14 | "tempLocation": "gs://bucket-name/temp",
15 | "zone": "us-central1-f"
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/documentation/Megalista - Technical User Guide - EXTERNAL.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/documentation/Megalista - Technical User Guide - EXTERNAL.pdf
--------------------------------------------------------------------------------
/generate_megalist_token.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generates OAuth2 access/refresh tokens for Megalista.
# Usage: ./generate_megalist_token.sh <client_id> <client_secret>

if [ $# != 2 ]; then
  echo "Usage: $0 client_id client_secret"
  exit 1
fi

# Install the pipeline requirements needed by the token generation script.
pip3 install --user -q -r megalist_dataflow/requirements.txt
# Quote the credentials so values with shell-special characters survive
# word splitting and globbing.
python3 cloud_config/generate_megalist_token.py --client_id "$1" --client_secret "$2"
23 |
--------------------------------------------------------------------------------
/megalist_dataflow/mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
--------------------------------------------------------------------------------
/megalist_dataflow/mappers/ads_ssd_hashing_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Iterable
16 | from models.execution import Batch
17 |
class AdsSSDHashingMapper():
    """Maps SSD conversion batches, SHA-256 hashing the email field."""

    def _hash_field(self, s):
        # Local import mirrors the original; presumably kept function-local
        # for Beam worker serialization — TODO confirm.
        import hashlib
        normalized = s.strip().lower()
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()

    def _map_conversion(self, conversion):
        # Hash the email; copy time and amount through untouched.
        return {
            'hashedEmail': self._hash_field(conversion['email']),
            'time': conversion['time'],
            'amount': conversion['amount']
        }

    def _map_conversions(self, conversions):
        return list(map(self._map_conversion, conversions))

    def map_batch(self, batch: Batch):
        """Returns a new Batch with all elements mapped to the hashed format."""
        return Batch(batch.execution, self._map_conversions(batch.elements))
35 |
36 |
--------------------------------------------------------------------------------
/megalist_dataflow/mappers/ads_user_list_pii_hashing_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | from models.execution import Batch
18 |
19 |
class FieldHasher:
    """Hashes PII fields with SHA-256, or passes them through unchanged.

    The behaviour is fixed at construction time via should_hash_fields.
    """

    def __init__(self, should_hash_fields):
        # Truthy -> hash_field returns a SHA-256 hex digest of the
        # trimmed, lower-cased value; falsy -> identity.
        self.should_hash_fields = should_hash_fields

    def hash_field(self, field):
        import hashlib

        if not self.should_hash_fields:
            return field

        normalized = field.strip().lower()
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()
32 |
33 |
class AdsUserListPIIHashingMapper:
    """Converts raw user dicts into Google Ads Customer Match entries,
    SHA-256 hashing PII unless the destination metadata opts out.
    """

    def __init__(self):
        self.logger = logging.getLogger(
            'megalista.AdsUserListPIIHashingMapper')

    def _hash_user(self, user, hasher):
        """Returns a copy of `user` with PII keys renamed and (maybe) hashed.

        Each field group is processed independently: a failure in one group
        (e.g. a missing mailing-address column) is logged and the remaining
        groups are still processed. Bare `except:` was narrowed to
        `except Exception` so SystemExit/KeyboardInterrupt propagate.
        """
        hashed = user.copy()

        try:
            if 'email' in user:
                hashed['hashedEmail'] = hasher.hash_field(user['email'])
                del hashed['email']
        except Exception:
            self.logger.error("Error hashing email for user: %s" % user)

        try:
            # First/last name are checked; country and zip are read
            # unconditionally, so their absence lands in the except below.
            if 'mailing_address_first_name' in user and 'mailing_address_last_name' in user:
                hashed['addressInfo'] = {
                    'hashedFirstName': hasher.hash_field(user['mailing_address_first_name']),
                    'hashedLastName': hasher.hash_field(user['mailing_address_last_name']),
                    'countryCode': user['mailing_address_country'],
                    'zipCode': user['mailing_address_zip']
                }
                del hashed['mailing_address_first_name']
                del hashed['mailing_address_last_name']
                del hashed['mailing_address_country']
                del hashed['mailing_address_zip']
        except Exception:
            self.logger.error("Error hashing address for user: %s" % user)

        try:
            if 'phone' in user:
                hashed['hashedPhoneNumber'] = hasher.hash_field(user['phone'])
                del hashed['phone']
        except Exception:
            self.logger.error("Error hashing phone for user: %s" % user)

        try:
            if 'mobile_device_id' in user:
                # Device ids are never hashed, only renamed.
                hashed['mobileId'] = user['mobile_device_id']
                del hashed['mobile_device_id']
        except Exception:
            self.logger.error(
                "Error hashing mobile_device_id for user: %s" % user)

        try:
            if 'user_id' in user:
                hashed['userId'] = hasher.hash_field(user['user_id'])
                del hashed['user_id']
        except Exception:
            self.logger.error("Error hashing user_id for user: %s" % user)

        return hashed

    def _get_should_hash_fields(self, metadata_list):
        """Reads the opt-out flag from the destination metadata.

        Hashing defaults to True; only an explicit, case-insensitive
        'false' in the third metadata column disables it.
        """
        if len(metadata_list) < 3:
            return True

        should_hash_fields = metadata_list[2]

        if not should_hash_fields:
            return True

        return should_hash_fields.lower() != 'false'

    def hash_users(self, batch: Batch):
        """Returns a new Batch with every element PII-hashed (or renamed only,
        when the metadata disables hashing)."""
        should_hash_fields = self._get_should_hash_fields(
            batch.execution.destination.destination_metadata)
        self.logger.debug('Should hash fields is %s' % should_hash_fields)

        return Batch(batch.execution, [self._hash_user(element, FieldHasher(should_hash_fields)) for element in batch.elements])
108 |
--------------------------------------------------------------------------------
/megalist_dataflow/mappers/ads_user_list_pii_hashing_mapper_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from mappers.ads_user_list_pii_hashing_mapper import AdsUserListPIIHashingMapper
16 |
17 | from models.execution import Batch
18 |
19 |
def test_get_should_hash_fields():
    mapper = AdsUserListPIIHashingMapper()

    # Anything other than an explicit 'false' (any casing) enables hashing,
    # as does a metadata list with no third entry at all.
    for enabling_flag in ('True', None, '', 'anything'):
        assert mapper._get_should_hash_fields(['ListName', 'Operator', enabling_flag])
    assert mapper._get_should_hash_fields(['ListName', 'Operator'])

    # Only a literal, case-insensitive 'false' disables it.
    for disabling_flag in ('false', 'FALSE', 'False'):
        assert not mapper._get_should_hash_fields(['ListName', 'Operator', disabling_flag])
35 |
36 |
def test_pii_hashing(mocker):
    # Two users sharing the same address except first name / email.
    users = [
        {
            "email": f"{first.lower()}@doe.com",
            "mailing_address_first_name": first,
            "mailing_address_last_name": "Doe",
            "mailing_address_zip": "12345",
            "mailing_address_country": "US"
        }
        for first in ("John", "Jane")
    ]

    # Execution mock without a third metadata entry -> hashing stays enabled.
    execution = mocker.MagicMock()
    execution.destination.destination_metadata = ['Audience', 'ADD']

    hashed = AdsUserListPIIHashingMapper().hash_users(
        Batch(execution, [users[0], users[1]])).elements

    assert len(hashed) == 2

    assert hashed[0] == {
        'hashedEmail': 'd709f370e52b57b4eb75f04e2b3422c4d41a05148cad8f81776d94a048fb70af',
        'addressInfo': {
            'countryCode': 'US',
            'hashedFirstName': '96d9632f363564cc3032521409cf22a852f2032eec099ed5967c0d000cec607a',
            'hashedLastName': '799ef92a11af918e3fb741df42934f3b568ed2d93ac1df74f1b8d41a27932a6f',
            'zipCode': '12345'
        }}

    assert hashed[1] == {
        'hashedEmail': '7c815580ad3844bcb627c74d24eaf700e1a711d9c23e9beb62ab8d28e8cb7954',
        'addressInfo': {
            'countryCode': 'US',
            'hashedFirstName': '81f8f6dde88365f3928796ec7aa53f72820b06db8664f5fe76a7eb13e24546a2',
            'hashedLastName': '799ef92a11af918e3fb741df42934f3b568ed2d93ac1df74f1b8d41a27932a6f',
            'zipCode': '12345'
        }}
83 |
84 |
def test_avoid_pii_hashing(mocker):
    users = [
        {
            "email": f"{first.lower()}@doe.com",
            "mailing_address_first_name": first,
            "mailing_address_last_name": "Doe",
            "mailing_address_zip": "12345",
            "mailing_address_country": "US"
        }
        for first in ("John", "Jane")
    ]

    # A third metadata entry of 'False' disables hashing:
    # fields are only renamed, values pass through untouched.
    execution = mocker.MagicMock()
    execution.destination.destination_metadata = ['Audience', 'ADD', 'False']

    hashed = AdsUserListPIIHashingMapper().hash_users(
        Batch(execution, [users[0], users[1]])).elements

    assert len(hashed) == 2

    assert hashed[0] == {
        'hashedEmail': 'john@doe.com',
        'addressInfo': {
            'countryCode': 'US',
            'hashedFirstName': 'John',
            'hashedLastName': 'Doe',
            'zipCode': '12345'
        }}

    assert hashed[1] == {
        'hashedEmail': 'jane@doe.com',
        'addressInfo': {
            'countryCode': 'US',
            'hashedFirstName': 'Jane',
            'hashedLastName': 'Doe',
            'zipCode': '12345'
        }}
130 |
--------------------------------------------------------------------------------
/megalist_dataflow/megalist_metadata:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Megalist",
3 | "description": "Buyers audience generator and uploader",
4 | "parameters": [
5 | {
6 | "name": "developer_token",
7 | "label": "Google Ads Developer Token",
8 | "help_text": "Google Ads Developer Token",
9 | "is_optional": "true"
10 | },
11 | {
12 | "name": "client_id",
13 | "label": "Client Id for the Google APIs",
14 | "help_text": "Client Id for the Google APIs"
15 | },
16 | {
17 | "name": "client_secret",
18 | "label": "Client Secret for the Google APIs",
19 | "help_text": "Client Secret for the Google APIs"
20 | },
21 | {
22 | "name": "access_token",
23 | "label": "Access Token for the Google APIs",
24 | "help_text": "Access Token for the Google APIs"
25 | },
26 | {
27 | "name": "refresh_token",
28 | "label": "Refresh Token for the Google APIs",
29 | "help_text": "Refresh Token for the Google APIs"
30 | },
31 | {
32 | "name": "setup_sheet_id",
33 | "label": "Google Sheets id for config",
34 | "help_text": "Google Sheets id for config"
35 | },
36 | {
37 | "name": "bq_ops_dataset",
      "label": "Auxiliary bigquery dataset used for Megalista operations",
      "help_text": "Auxiliary bigquery dataset used for Megalista operations"
40 | },
41 | {
42 | "name": "appsflyer_dev_key",
43 | "label": "Developer key for AppsFlyer S2S API",
44 | "help_text": "Developer key for AppsFlyer S2S API",
45 | "is_optional": "true"
46 | }
47 | ]
48 | }
49 |
--------------------------------------------------------------------------------
/megalist_dataflow/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Package name marker; presumably consumed by packaging/Dataflow setup tooling — TODO confirm.
name = "dataflow_deps"
--------------------------------------------------------------------------------
/megalist_dataflow/models/execution.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from enum import Enum
16 | from typing import Dict, List, Union
17 |
18 | OK_STATUS = 'OK'
19 |
20 |
class DestinationType(Enum):
    """Supported upload endpoints for an Execution's destination."""
    CM_OFFLINE_CONVERSION, \
    ADS_OFFLINE_CONVERSION, \
    ADS_SSD_UPLOAD, \
    ADS_ENHANCED_CONVERSION, \
    ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD, \
    ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD, \
    ADS_CUSTOMER_MATCH_USER_ID_UPLOAD, \
    GA_USER_LIST_UPLOAD, \
    APPSFLYER_S2S_EVENTS, \
    GA_MEASUREMENT_PROTOCOL, \
    GA_DATA_IMPORT, \
    GA_4_MEASUREMENT_PROTOCOL = range(12)

    def __eq__(self, other):
        # Name-based comparison, presumably so equality also holds across
        # separately-imported copies of this enum — TODO confirm.
        if other is None:
            return False
        # getattr guards against operands without a .name attribute, which
        # previously raised AttributeError instead of returning False.
        return self.name == getattr(other, 'name', None)

    def __hash__(self):
        # Defining __eq__ implicitly set __hash__ to None, making members
        # unhashable (unusable as dict keys or set elements). Restore a
        # name-based hash consistent with __eq__.
        return hash(self.name)
39 |
40 |
class SourceType(Enum):
    """Where execution input rows are read from."""
    BIG_QUERY = 0
    CSV = 1  # TODO: CSV not yet implemented
45 |
46 |
class AccountConfig:
    """Account identifiers shared by every execution of the pipeline.

    Note: `mcc` appears in __str__ but does not participate in
    __eq__/__hash__ — two configs differing only by the MCC flag
    compare equal.
    """

    def __init__(
        self,
        google_ads_account_id: str,
        mcc: bool,
        google_analytics_account_id: str,
        campaign_manager_account_id: str,
        app_id: str
    ):
        self._google_ads_account_id = google_ads_account_id
        self._mcc = mcc
        self._google_analytics_account_id = google_analytics_account_id
        self._campaign_manager_account_id = campaign_manager_account_id
        self._app_id = app_id

    @property
    def google_ads_account_id(self) -> str:
        return self._google_ads_account_id

    @property
    def mcc(self) -> bool:
        return self._mcc

    @property
    def google_analytics_account_id(self) -> str:
        return self._google_analytics_account_id

    @property
    def campaign_manager_account_id(self) -> str:
        return self._campaign_manager_account_id

    @property
    def app_id(self) -> str:
        return self._app_id

    def __str__(self) -> str:
        return f"\n[Account Config]\n\t" \
               f"Google Ads Customer Id: {self.google_ads_account_id}\n\t" \
               f"Google Ads MCC: {self._mcc}\n\t" \
               f"Google Analytics Account Id: {self.google_analytics_account_id}\n\t" \
               f"Campaign Manager Account Id: {self.campaign_manager_account_id}\n\t" \
               f"Play Store App Id: {self.app_id}"

    def __eq__(self, other):
        mine = (self.google_ads_account_id, self.google_analytics_account_id,
                self.campaign_manager_account_id, self.app_id)
        theirs = (other.google_ads_account_id, other.google_analytics_account_id,
                  other.campaign_manager_account_id, other.app_id)
        return mine == theirs

    def __hash__(self):
        return hash((self.google_ads_account_id, self.google_analytics_account_id,
                     self.campaign_manager_account_id, self.app_id))
99 |
100 |
class Source:
    """Input definition for an execution: name, type and metadata.

    Note: __hash__ reads source_metadata[0] and [1], so the metadata list
    is assumed to hold at least two entries when Source is used as a key.
    """

    def __init__(
        self,
        source_name: str,
        source_type: SourceType,
        source_metadata: List[str]
    ):
        self._source_name = source_name
        self._source_type = source_type
        self._source_metadata = source_metadata

    @property
    def source_name(self) -> str:
        return self._source_name

    @property
    def source_type(self) -> SourceType:
        return self._source_type

    @property
    def source_metadata(self) -> List[str]:
        return self._source_metadata

    def __eq__(self, other):
        return (self.source_name == other.source_name
                and self.source_type == other.source_type
                and self.source_metadata == other.source_metadata)

    def __hash__(self):
        # Lists are unhashable; fold in just the first two metadata entries.
        return hash((self.source_name, self.source_type,
                     self.source_metadata[0], self.source_metadata[1]))
131 |
132 |
class Destination:
    """Output definition for an execution: name, type and metadata."""

    def __init__(
        self,
        destination_name: str,
        destination_type: DestinationType,
        destination_metadata: List[str]
    ):
        self._destination_name = destination_name
        self._destination_type = destination_type
        self._destination_metadata = destination_metadata

    @property
    def destination_name(self) -> str:
        return self._destination_name

    @property
    def destination_type(self) -> DestinationType:
        return self._destination_type

    @property
    def destination_metadata(self) -> List[str]:
        return self._destination_metadata

    def __eq__(self, other) -> bool:
        # Equality looks only at the name and the first metadata entry.
        # NOTE(review): __hash__ additionally folds in destination_type.name,
        # so two objects that compare equal but differ in type would hash
        # differently — verify this is intended.
        same_name = self.destination_name == other.destination_name
        same_first_meta = self.destination_metadata[0] == other.destination_metadata[0]
        return bool(same_name and same_first_meta)

    def __hash__(self) -> int:
        return hash((self.destination_name, self.destination_type.name,
                     self.destination_metadata[0]))
161 |
162 |
class Execution:
    """Binds an account configuration to one (source, destination) pair."""

    def __init__(
        self,
        account_config: AccountConfig,
        source: Source,
        destination: Destination
    ):
        self._account_config = account_config
        self._source = source
        self._destination = destination

    @property
    def source(self) -> Source:
        return self._source

    @property
    def destination(self) -> Destination:
        return self._destination

    @property
    def account_config(self) -> AccountConfig:
        return self._account_config

    def __str__(self):
        return 'Origin name: {}. Action: {}. Destination name: {}'.format(self.source.source_name,
                                                                          self.destination.destination_type,
                                                                          self.destination.destination_name)

    def __eq__(self, other):
        if other is None:
            return False
        return (self.source == other.source
                and self.destination == other.destination
                and self.account_config == other.account_config)

    def __hash__(self):
        return hash((self.source, self.destination, self.account_config))
200 |
201 |
class Batch:
    """A fixed-size slice of elements to upload, tied to its Execution."""

    def __init__(
        self,
        execution: Execution,
        elements: List[Dict[str, Union[str, Dict[str, str]]]]
    ):
        self._execution = execution
        self._elements = elements

    @property
    def execution(self) -> Execution:
        return self._execution

    @property
    def elements(self) -> List[Dict[str, Union[str, Dict[str, str]]]]:
        return self._elements

    def __str__(self):
        return f'Execution: {self._execution}. Elements: {self._elements}'

    def __eq__(self, other):
        # Two batches match only when both the execution and the payload match.
        if other is None:
            return False
        return (self.execution, self.elements) == (other.execution, other.elements)

    def __hash__(self):
        # Elements are mutable dicts (unhashable), so only the execution
        # contributes; the 'Batch' tag keeps these hashes distinct from bare
        # Execution hashes.
        return hash(('Batch', self.execution))
229 |
--------------------------------------------------------------------------------
/megalist_dataflow/models/oauth_credentials.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
class OAuthCredentials:
    """Holds OAuth2 client and token providers for the Google APIs.

    Each constructor argument is expected to expose a ``get()`` method
    (e.g. an apache_beam ``ValueProvider``); the ``get_*`` accessors
    unwrap the underlying string values on demand.
    """

    def __init__(self, client_id, client_secret, access_token, refresh_token):
        # Providers are stored as-is and resolved lazily via .get().
        self.client_id = client_id
        self.client_secret = client_secret
        self.access_token = access_token
        self.refresh_token = refresh_token

    def get_client_id(self):
        """Resolve and return the OAuth client id."""
        return self.client_id.get()

    def get_client_secret(self):
        """Resolve and return the OAuth client secret."""
        return self.client_secret.get()

    def get_access_token(self):
        """Resolve and return the OAuth access token."""
        return self.access_token.get()

    def get_refresh_token(self):
        """Resolve and return the OAuth refresh token."""
        return self.refresh_token.get()
34 |
--------------------------------------------------------------------------------
/megalist_dataflow/models/oauth_credentials_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from models.oauth_credentials import OAuthCredentials
16 | from apache_beam.options.value_provider import StaticValueProvider
17 |
18 |
def test_init():
    """OAuthCredentials should unwrap each StaticValueProvider via get_*()."""
    client_id = StaticValueProvider(str, "id")
    client_secret = StaticValueProvider(str, "secret")
    access_token = StaticValueProvider(str, "access")
    refresh_token = StaticValueProvider(str, "refresh")
    credentials = OAuthCredentials(client_id, client_secret, access_token, refresh_token)
    assert credentials.get_client_id() == "id"
    assert credentials.get_client_secret() == "secret"
    assert credentials.get_access_token() == "access"
    assert credentials.get_refresh_token() == "refresh"
29 |
--------------------------------------------------------------------------------
/megalist_dataflow/models/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from apache_beam.options.pipeline_options import PipelineOptions
16 |
17 |
class DataflowOptions(PipelineOptions):
    """Command-line options for the Megalista pipeline.

    Value-provider arguments may be supplied at template execution time;
    plain arguments (--gcp_project_id, --output) are fixed at graph
    construction time.
    """

    @classmethod
    def _add_argparse_args(cls, parser):
        # OAUTH
        parser.add_value_provider_argument(
            '--client_id', help='Client Id for the Google APIs')
        parser.add_value_provider_argument(
            '--client_secret', help='Client Secret for the Google APIs')
        parser.add_value_provider_argument(
            '--refresh_token', help='OAUTH Refresh Token for the Google APIs')
        parser.add_value_provider_argument(
            '--access_token', help='OAUTH Access Token for the Google APIs')
        # Set up
        parser.add_value_provider_argument(
            '--setup_sheet_id', help='Id of Spreadsheet with execution info')
        parser.add_value_provider_argument(
            '--setup_firestore_collection', help='Name of Firestore collection with execution info')
        parser.add_value_provider_argument(
            '--bq_ops_dataset',
            # Fixed typo: 'Auxliary' -> 'Auxiliary'.
            help='Auxiliary bigquery dataset used for Megalista operations')
        # Google Ads
        parser.add_value_provider_argument(
            '--developer_token', help='Developer Token for Google Ads API')
        parser.add_value_provider_argument(
            '--customer_id', help='Google Ads Customer Id')
        # Google Analytics
        parser.add_value_provider_argument(
            '--google_analytics_account_id', help='Google Analytics account Id')
        parser.add_value_provider_argument(
            '--google_analytics_web_property_id',
            help='Google Analytics web property Id')
        parser.add_value_provider_argument(
            '--google_analytics_buyer_custom_dim',
            help='Google Analytics buyer custom dimension')
        parser.add_value_provider_argument(
            '--google_analytics_user_id_custom_dim',
            help='Google Analytics User Id custom dimension')
        # Campaign Manager
        parser.add_value_provider_argument(
            '--dcm_profile_id', help='CampaignManager profile Id')
        parser.add_value_provider_argument(
            '--floodlight_activity_id',
            help='CampaignManager floodlight activity Id')
        parser.add_value_provider_argument(
            '--floodlight_configuration_id',
            help='CampaignManager floodlight configuration Id')
        # Conversion Plus
        parser.add_value_provider_argument(
            '--cp_sheet_id', help='Conversion Plus Sheet Id')
        parser.add_value_provider_argument(
            '--cp_sheet_range',
            help='Name of the Conversion Plus Sheet config range')
        # BigQuery
        parser.add_value_provider_argument(
            '--dataset_id', default='megalist', help='BigQuery dataset Id')
        parser.add_value_provider_argument(
            # Bug fix: help text previously said 'BigQuery dataset Id',
            # copy-pasted from --dataset_id; this flag is the table id.
            '--table_id', default='crm_upload', help='BigQuery table Id')
        # GCP
        parser.add_argument(
            '--gcp_project_id', help='ID Google Cloud Project to use')
        parser.add_argument('--output', help='Output file to write results to.')
        # APPSFLYER
        parser.add_value_provider_argument(
            '--appsflyer_dev_key', help='Developer key for AppsFlyer S2S API')
--------------------------------------------------------------------------------
/megalist_dataflow/models/options_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from models.options import DataflowOptions
16 |
17 |
def test_options(mocker):
    """All expected flags should be registered on the argument parser."""
    parser = mocker.MagicMock()
    DataflowOptions._add_argparse_args(parser)
    # Flags registered as value providers, all with some help text.
    value_provider_flags = [
        "--client_id",
        "--client_secret",
        "--refresh_token",
        "--access_token",
        "--developer_token",
        "--customer_id",
        "--google_analytics_account_id",
        "--google_analytics_web_property_id",
        "--google_analytics_buyer_custom_dim",
        "--google_analytics_user_id_custom_dim",
        "--dcm_profile_id",
        "--floodlight_activity_id",
        "--floodlight_configuration_id",
    ]
    for flag in value_provider_flags:
        parser.add_value_provider_argument.assert_any_call(flag, help=mocker.ANY)
    # BigQuery flags carry defaults.
    parser.add_value_provider_argument.assert_any_call("--dataset_id", default="megalist", help=mocker.ANY)
    parser.add_value_provider_argument.assert_any_call("--table_id", default="crm_upload", help=mocker.ANY)
    # Plain (non value-provider) arguments.
    parser.add_argument.assert_any_call("--gcp_project_id", help=mocker.ANY)
    parser.add_argument.assert_any_call("--output", help=mocker.ANY)
38 |
--------------------------------------------------------------------------------
/megalist_dataflow/models/sheets_config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from google.oauth2.credentials import Credentials
16 | from googleapiclient.discovery import build
17 |
18 |
class SheetsConfig:
    """Thin wrapper over the Google Sheets v4 API for reading config ranges."""

    def __init__(self, oauth_credentials):
        self._oauth_credentials = oauth_credentials
        # Sheets service handle, built lazily on first use.
        self._sheets_service = None

    def _get_sheets_service(self):
        """Build (once) and return the authenticated Sheets service."""
        if not self._sheets_service:
            credentials = Credentials(
                token=self._oauth_credentials.get_access_token(),
                refresh_token=self._oauth_credentials.get_refresh_token(),
                client_id=self._oauth_credentials.get_client_id(),
                client_secret=self._oauth_credentials.get_client_secret(),
                token_uri='https://accounts.google.com/o/oauth2/token',
                scopes=['https://www.googleapis.com/auth/spreadsheets.readonly'])
            self._sheets_service = build('sheets', 'v4', credentials=credentials)
        return self._sheets_service

    def to_dict(self, config):
        """Turn rows shaped [key, op, value, multiplier] into a keyed dict."""
        return {row[0]: {"op": row[1], "value": row[2], "multiplier": row[3]}
                for row in config}

    def get_config(self, sheet_id, range):
        """Read the given range and return it as a config dict (see to_dict)."""
        rows = self.get_range(sheet_id, range)
        return self.to_dict(rows['values'])

    def get_range(self, sheet_id, range):
        """Fetch a raw values range from the spreadsheet."""
        service = self._get_sheets_service()
        return service.spreadsheets().values().get(spreadsheetId=sheet_id, range=range).execute()

    def get_value(self, sheet_id, range):
        """Return the first cell of the range, or None when the range is empty."""
        result = self.get_range(sheet_id, range)
        values = result.get('values')
        if values is None:
            return None
        return values[0][0]
52 |
--------------------------------------------------------------------------------
/megalist_dataflow/requirements.txt:
--------------------------------------------------------------------------------
1 | googleads==24.1.0
2 | httplib2==0.17.4
3 | protobuf==3.13.0
4 | google-api-python-client==1.12.8
5 | google-cloud-core==1.4.1
6 | google-cloud-bigquery==1.27.2
7 | apache-beam[gcp]==2.28.0
8 | apache-beam==2.28.0
9 | google-cloud-datastore==1.13.1
10 | google-apitools==0.5.31
11 | pytest==5.4.3
12 | pytest-cov==2.11.1
13 | pytest-mock==3.2.0
14 | requests-mock==1.8.0
15 | pytz==2021.1
16 | wheel==0.34.2
17 | pyarrow==0.17.1
18 | aiohttp==3.7.4
19 | bloom-filter==1.3
20 | six==1.15.0
21 | mypy==0.790
--------------------------------------------------------------------------------
/megalist_dataflow/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
import setuptools

# Distribution version; reused as the package version below.
__version__ = "1.0.0"
# Packaging metadata for the megalist_dataflow pipeline. install_requires
# pins the client libraries needed on the Dataflow workers at runtime.
# NOTE(review): these pins differ from megalist_dataflow/requirements.txt
# (e.g. google-api-python-client 1.10.0 here vs 1.12.8 there) — presumably
# intentional for the worker image, but worth confirming.
setuptools.setup(
    name='megalist_dataflow',
    version=__version__,
    author='DP6 fork from Google/megalista',
    author_email='koopas@dp6.com.br',
    url='https://github.com/DP6/marketing-data-sync',
    install_requires=['googleads==24.1.0', 'google-api-python-client==1.10.0',
                      'google-cloud-core==1.3.0', 'google-cloud-bigquery==1.26.0',
                      'google-cloud-datastore==1.13.1', 'aiohttp==3.7.4'],
    packages=setuptools.find_packages(),
)
29 |
--------------------------------------------------------------------------------
/megalist_dataflow/sources/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/sources/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/sources/base_bounded_source.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod
16 |
17 | from apache_beam.io import OffsetRangeTracker
18 | from apache_beam.io import iobase
19 | from apache_beam.io.iobase import RangeTracker
20 | from apache_beam.io.iobase import SourceBundle
21 |
22 | from typing import Any
23 | from typing import Iterator
24 | from typing import Optional
25 |
26 |
class BaseBoundedSource(iobase.BoundedSource):
    """
    Abstract class implementing common methods of BoundedSource applicable to a fixed size Source
    """

    def __init__(self):
        # Cached element count; populated lazily by count().
        self._count = None

    def count(self):
        """Return the source size, computing it once via _do_count()."""
        if self._count is None:
            self._count = self._do_count()
        return self._count

    @abstractmethod
    def _do_count(self):
        """
        :return: Size of source
        """
        raise NotImplementedError

    def split(self,
              desired_bundle_size,  # type: int
              start_position=None,  # type: Optional[Any]
              stop_position=None,  # type: Optional[Any]
              ):  # type: (...) -> Iterator[SourceBundle]
        """Partition [start, stop) into bundles of at most desired_bundle_size."""
        begin = 0 if start_position is None else start_position
        end = self.count() if stop_position is None else stop_position

        cursor = begin
        while cursor < end:
            upper = min(end, cursor + desired_bundle_size)
            yield iobase.SourceBundle(
                weight=(upper - cursor),
                source=self,
                start_position=cursor,
                stop_position=upper)
            cursor = upper

    def get_range_tracker(self,
                          start_position,  # type: Optional[Any]
                          stop_position,  # type: Optional[Any]
                          ):  # type: (...) -> RangeTracker
        """Build an offset tracker over the requested range, defaulting to [0, count())."""
        begin = 0 if start_position is None else start_position
        end = self.count() if stop_position is None else stop_position

        return OffsetRangeTracker(begin, end)
77 |
--------------------------------------------------------------------------------
/megalist_dataflow/sources/batches_from_executions.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Any, Dict, List, Iterable, Tuple
16 |
17 | import apache_beam as beam
18 | import logging
19 | from google.cloud import bigquery
20 | from apache_beam.io.gcp.bigquery import ReadFromBigQueryRequest
21 |
22 | from models.execution import DestinationType, Execution, Batch
23 |
24 |
class BatchesFromExecutions(beam.PTransform):
    """
    Filter the received executions by the received action,
    load the data using the received source and group by that batch size and Execution.
    """

    class _ExecutionIntoBigQueryRequest(beam.DoFn):
        # Turns an Execution into a full-table BigQuery read request, tagging
        # every row with the execution's hash so rows can be grouped back to
        # their Execution after the read.
        def process(self, execution: Execution) -> Iterable[ReadFromBigQueryRequest]:
            # source_metadata[0]/[1] are presumably dataset and table names —
            # TODO confirm against the execution sources.
            table_name = execution.source.source_metadata[0] + \
                '.' + execution.source.source_metadata[1]
            # NOTE(review): table_name is interpolated directly into SQL;
            # this assumes the configuration source (sheet/Firestore) is trusted.
            query = f"SELECT Data.*, '{hash(execution)}' AS execution_hash FROM {table_name} AS Data"
            return [ReadFromBigQueryRequest(query=query)]

    class _ExecutionIntoBigQueryRequestTransactional(beam.DoFn):
        # Like _ExecutionIntoBigQueryRequest, but first ensures a companion
        # "<table>_uploaded" table exists and then reads only rows whose uuid
        # has not been recorded there yet (i.e. not previously uploaded).
        def process(self, execution: Execution) -> Iterable[ReadFromBigQueryRequest]:
            table_name = execution.source.source_metadata[0] + \
                '.' + execution.source.source_metadata[1]
            uploaded_table_name = f"{table_name}_uploaded"
            client = bigquery.Client()

            # DDL is executed synchronously here (outside the beam read) so the
            # LEFT JOIN below never references a missing table. Partitions
            # expire after 15 days, bounding the dedup window.
            query = "CREATE TABLE IF NOT EXISTS " + uploaded_table_name + " ( \
              timestamp TIMESTAMP OPTIONS(description= 'Event timestamp'), \
              uuid STRING OPTIONS(description='Event unique identifier'))\
              PARTITION BY _PARTITIONDATE \
              OPTIONS(partition_expiration_days=15)"

            logging.getLogger("megalista.ExecutionIntoBigQueryRequestTransactional").info(
                "Creating table %s if it doesn't exist", uploaded_table_name)

            client.query(query).result()

            # Anti-join: keep only rows whose uuid is absent from the uploaded
            # table. Assumes the source table has a 'uuid' column.
            query = f"SELECT Data.*, '{hash(execution)}' AS execution_hash FROM {table_name} AS Data \
              LEFT JOIN {uploaded_table_name} AS Uploaded USING(uuid) \
              WHERE Uploaded.uuid IS NULL;"

            return [ReadFromBigQueryRequest(query=query)]

    class _BatchElements(beam.DoFn):
        # Groups the rows of one execution_hash into fixed-size Batch objects.
        def __init__(self, batch_size: int):
            self._batch_size = batch_size

        def process(self, element, executions: Iterable[Execution]):
            # element is (execution_hash, rows) from the upstream GroupBy.
            # Recover the Execution whose hash matches the group key; raises
            # StopIteration if none matches (treated as a pipeline error).
            execution = next(
                (execution for execution in executions if str(hash(execution)) == element[0]))
            batch: List[Any] = []
            # NOTE(review): the loop variable intentionally shadows `element`
            # after element[1] has been captured by enumerate.
            for i, element in enumerate(element[1]):
                if i != 0 and i % self._batch_size == 0:
                    yield Batch(execution, batch)
                    batch = []
                batch.append(element)
            # Emit the final (possibly partial) batch.
            yield Batch(execution, batch)

    def __init__(
        self,
        destination_type: DestinationType,
        batch_size: int = 5000,
        transactional: bool = False
    ):
        # transactional=True enables uuid-based dedup against the
        # "<table>_uploaded" companion table.
        super().__init__()
        self._destination_type = destination_type
        self._batch_size = batch_size
        self._transactional = transactional

    def _get_bq_request_class(self):
        # Select the DoFn that builds the BigQuery read request(s).
        if self._transactional:
            return self._ExecutionIntoBigQueryRequestTransactional()
        return self._ExecutionIntoBigQueryRequest()

    def expand(self, executions):
        # Pipeline: keep executions of this destination type -> build read
        # requests -> read rows -> regroup rows by execution_hash -> emit
        # Batch objects (executions are re-supplied as a side input so the
        # hash can be mapped back to its Execution).
        return (
            executions
            | beam.Filter(lambda execution: execution.destination.destination_type == self._destination_type)
            | beam.ParDo(self._get_bq_request_class())
            | beam.io.ReadAllFromBigQuery()
            | beam.GroupBy(lambda x: x['execution_hash'])
            | beam.ParDo(self._BatchElements(self._batch_size), beam.pvalue.AsList(executions))
        )
102 |
--------------------------------------------------------------------------------
/megalist_dataflow/sources/firestore_execution_source.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import distutils.util
15 | import logging
16 |
17 | from apache_beam.options.value_provider import ValueProvider
18 |
19 | from google.cloud import firestore
20 | from sources.base_bounded_source import BaseBoundedSource
21 | from models.execution import Destination, DestinationType
22 | from models.execution import Execution, AccountConfig
23 | from models.execution import Source, SourceType
24 |
25 |
class FirestoreExecutionSource(BaseBoundedSource):
    """
    Read Execution data from a Firestore collection. The collection name is set-up in the parameter "setup_firestore_collection"
    """

    def __init__(
        self,
        setup_firestore_collection: ValueProvider
    ):
        super().__init__()
        self._setup_firestore_collection = setup_firestore_collection

    def _do_count(self):
        # TODO: implement count
        # Hard-coded estimate; only used by the runner as a splitting hint.
        return 3

    def read(self, range_tracker):
        """Yield one Execution per active document in the configured collection."""
        def document_to_dict(doc):
            # Flatten a Firestore document into a plain dict, carrying the
            # document id under 'id'; missing documents become None.
            if not doc.exists:
                return None
            doc_dict = doc.to_dict()
            doc_dict['id'] = doc.id
            return doc_dict

        firestore_collection = self._setup_firestore_collection.get()
        logging.getLogger("megalista.FirestoreExecutionSource").info(f"Loading Firestore collection {firestore_collection}...")
        db = firestore.Client()
        # NOTE(review): the query matches the lowercase string 'yes', while
        # the loop below re-checks entry['active'].upper() == 'YES' — so in
        # practice only 'yes' documents ever arrive here. Confirm intended.
        entries = db.collection(self._setup_firestore_collection.get()).where('active', '==', 'yes').stream()
        entries = [document_to_dict(doc) for doc in entries]

        # Account-level settings live in a special 'account_config' document
        # of the same collection.
        account_data = document_to_dict(db.collection(self._setup_firestore_collection.get()).document('account_config').get())

        if not account_data:
            raise Exception('Firestore collection is absent')
        google_ads_id = account_data.get('google_ads_id', 'empty')
        mcc_trix = account_data.get('mcc_trix', 'FALSE')
        # NOTE(review): with the 'FALSE' default above, mcc_trix is only None
        # when the field is explicitly stored as null in Firestore.
        mcc = False if mcc_trix is None else bool(distutils.util.strtobool(mcc_trix))
        app_id = account_data.get('app_id', 'empty')
        google_analytics_account_id = account_data.get('google_analytics_account_id', 'empty')
        campaign_manager_account_id = account_data.get('campaign_manager_account_id', 'empty')

        account_config = AccountConfig(google_ads_id, mcc, google_analytics_account_id, campaign_manager_account_id, app_id)
        logging.getLogger("megalista.FirestoreExecutionSource").info(f"Loaded: {account_config}")

        # Sources and destinations are keyed by '<doc id>_source' /
        # '<doc id>_destination', pairing each document with its own step.
        sources = self._read_sources(entries)
        destinations = self._read_destination(entries)
        if entries:
            for entry in entries:
                if entry['active'].upper() == 'YES':
                    logging.getLogger("megalista.FirestoreExecutionSource").info(
                        f"Executing step Source:{sources[entry['id'] + '_source'].source_name} -> Destination:{destinations[entry['id'] + '_destination'].destination_name}")
                    yield Execution(account_config, sources[entry['id'] + '_source'], destinations[entry['id'] + '_destination'])
        else:
            logging.getLogger("megalista.FirestoreExecutionSource").warn("No schedules found!")

    @staticmethod
    def _read_sources(entries):
        # Build a Source per document; metadata is the BigQuery
        # dataset/table pair taken from the document fields.
        sources = {}
        if entries:
            for entry in entries:
                metadata = [entry['bq_dataset'], entry['bq_table']] #TODO: flexibilize for other source types
                source = Source(entry['id'] + '_source', SourceType[entry['source']], metadata)
                sources[source.source_name] = source
        else:
            logging.getLogger("megalista.FirestoreExecutionSource").warn("No sources found!")
        return sources

    @staticmethod
    def _read_destination(entries):
        # Build a Destination per document, validating that every metadata
        # field required by the destination type is present.
        def create_metadata_list(entry):
            # Required document fields per destination type.
            metadata_list = {
                'ADS_OFFLINE_CONVERSION': ['gads_conversion_name'],
                'ADS_SSD_UPLOAD': ['gads_conversion_name', 'gads_external_upload_id'],
                'ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD': ['gads_audience_name', 'gads_operation', 'gads_hash'],
                'ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD': ['gads_audience_name', 'gads_operation'],
                'ADS_CUSTOMER_MATCH_USER_ID_UPLOAD': ['gads_audience_name', 'gads_operation'],
                'GA_MEASUREMENT_PROTOCOL': ['google_analytics_property_id', 'google_analytics_non_interaction'],
                'CM_OFFLINE_CONVERSION': ['campaign_manager_floodlight_activity_id', 'campaign_manager_floodlight_configuration_id'],
                'APPSFLYER_S2S_EVENTS': ['appsflyer_app_id'],
            }

            entry_type = entry['type']
            metadata = metadata_list.get(entry_type, None)
            if not metadata:
                raise Exception(f'Upload type not implemented: {entry_type}')
            entry_metadata = []
            for m in metadata:
                if m in entry:
                    entry_metadata.append(entry[m])
                else:
                    raise Exception(f'Missing field in Firestore document for {entry_type}: {m}')
            return entry_metadata


        destinations = {}
        if entries:
            for entry in entries:
                destination = Destination(entry['id'] + '_destination', DestinationType[entry['type']], create_metadata_list(entry))
                destinations[destination.destination_name] = destination
        else:
            logging.getLogger("megalista.FirestoreExecutionSource").warn("No destinations found!")
        return destinations
128 |
--------------------------------------------------------------------------------
/megalist_dataflow/sources/spreadsheet_execution_source.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import distutils.util
15 | import logging
16 |
17 | from apache_beam.options.value_provider import ValueProvider
18 |
19 | from sources.base_bounded_source import BaseBoundedSource
20 | from models.execution import Destination, DestinationType
21 | from models.execution import Execution, AccountConfig
22 | from models.execution import Source, SourceType
23 | from models.sheets_config import SheetsConfig
24 |
25 |
class SpreadsheetExecutionSource(BaseBoundedSource):
    """
    Read Execution data from a sheet. The sheet id is set-up in the parameter "setup_sheet_id"
    """

    def __init__(
        self,
        sheets_config: SheetsConfig,
        setup_sheet_id: ValueProvider
    ):
        super().__init__()
        self._sheets_config = sheets_config
        self._setup_sheet_id = setup_sheet_id

    def _do_count(self):
        # TODO: really count the number of lines in the sheet
        # Hard-coded estimate; only used by the runner as a splitting hint.
        return 3

    def read(self, range_tracker):
        """Yield one Execution per schedule row marked 'YES' in the sheet."""
        sheet_id = self._setup_sheet_id.get()
        logging.getLogger("megalista.SpreadsheetExecutionSource").info(f"Loading configuration sheet {sheet_id}...")
        # Account-level settings are read from named ranges in the sheet.
        google_ads_id = self._sheets_config.get_value(sheet_id, "GoogleAdsAccountId")
        mcc_trix = self._sheets_config.get_value(sheet_id, "GoogleAdsMCC")
        # get_value returns None when the named range is empty, hence the guard.
        mcc = False if mcc_trix is None else bool(distutils.util.strtobool(mcc_trix))
        app_id = self._sheets_config.get_value(sheet_id, "AppId")
        google_analytics_account_id = self._sheets_config.get_value(sheet_id, "GoogleAnalyticsAccountId")
        campaign_manager_account_id = self._sheets_config.get_value(sheet_id, "CampaignManagerAccountId")
        account_config = AccountConfig(google_ads_id, mcc, google_analytics_account_id, campaign_manager_account_id, app_id)
        logging.getLogger("megalista.SpreadsheetExecutionSource").info(f"Loaded: {account_config}")

        sources = self._read_sources(self._sheets_config, sheet_id)
        destinations = self._read_destination(self._sheets_config, sheet_id)

        # Each schedule row is presumably [active, source_name,
        # destination_name] — TODO confirm against the sheet template.
        schedules_range = self._sheets_config.get_range(sheet_id, 'SchedulesRange')
        if 'values' in schedules_range:
            for schedule in schedules_range['values']:
                if schedule[0] == 'YES':
                    logging.getLogger("megalista.SpreadsheetExecutionSource").info(
                        f"Executing step Source:{sources[schedule[1]].source_name} -> Destination:{destinations[schedule[2]].destination_name}")
                    yield Execution(account_config, sources[schedule[1]], destinations[schedule[2]])
        else:
            logging.getLogger("megalista.SpreadsheetExecutionSource").warn("No schedules found!")

    @staticmethod
    def _read_sources(sheets_config, sheet_id):
        # Rows: [name, source_type, *metadata]; keyed by source name.
        range = sheets_config.get_range(sheet_id, 'SourcesRange')
        sources = {}
        if 'values' in range:
            for row in range['values']:
                source = Source(row[0], SourceType[row[1]], row[2:])
                sources[source.source_name] = source
        else:
            logging.getLogger("megalista.SpreadsheetExecutionSource").warn("No sources found!")
        return sources

    @staticmethod
    def _read_destination(sheets_config, sheet_id):
        # Rows: [name, destination_type, *metadata]; keyed by destination name.
        range = sheets_config.get_range(sheet_id, 'DestinationsRange')
        destinations = {}
        if 'values' in range:
            for row in range['values']:
                destination = Destination(row[0], DestinationType[row[1]], row[2:])
                destinations[destination.destination_name] = destination
        else:
            logging.getLogger("megalista.SpreadsheetExecutionSource").warn("No destinations found!")
        return destinations
92 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Package name marker. NOTE(review): no consumer of this attribute is visible
# from here — presumably kept for packaging/tooling; confirm before removing.
name = "uploaders"
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/appsflyer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/appsflyer/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/appsflyer/appsflyer_s2s_uploader_async.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | import apache_beam as beam
18 | import time
19 | from datetime import datetime
20 | from typing import Any, List
21 |
22 | import asyncio
23 | from aiohttp import ClientSession, ClientTimeout
24 |
25 | from uploaders import utils
26 | from models.execution import DestinationType, Batch
27 |
28 |
class AppsFlyerS2SUploaderDoFn(beam.DoFn):
    """Uploads in-app events to the AppsFlyer S2S (server-to-server) API.

    All requests of a batch are sent concurrently over a single aiohttp
    session, each retried up to 3 attempts with increasing back-off, and
    the batch is throttled to the API rate limit of 500 requests per second.
    """

    def __init__(self, dev_key):
        """
        Args:
          dev_key: ValueProvider holding the AppsFlyer dev key, sent as the
            'authentication' header on every request.
        """
        super().__init__()
        self.API_URL = "https://api2.appsflyer.com/inappevent/"
        self.dev_key = dev_key
        # Filled per batch from the execution's destination metadata.
        self.app_id = None
        self.timeout = ClientTimeout(total=15)  # 15 sec timeout

    def start_bundle(self):
        pass

    async def _prepare_and_send(self, session, row, success_elements):
        """Builds the S2S payload for one row, posts it and records success.

        Appends the row to success_elements when the API answers HTTP 200.
        Returns the final response status (or -1 after repeated exceptions).
        """
        # Prepare payload: mandatory fields first, optional ones bound below.
        payload = {
            "appsflyer_id": row['appsflyer_id'],
            "eventName": row['event_eventName'],
            "eventValue": "",
            "af_events_api" :"true"
        }
        self.bind_key(payload, row, 'device_ids_idfa','idfa')
        self.bind_key(payload, row, 'device_ids_advertising_id','advertising_id')
        self.bind_key(payload, row, 'device_ids_oaid','oaid')
        self.bind_key(payload, row, 'device_ids_amazon_aid','amazon_aid')
        self.bind_key(payload, row, 'device_ids_imei','imei')
        self.bind_key(payload, row, 'customer_user_id','customer_user_id')
        self.bind_key(payload, row, 'ip','ip')
        self.bind_key(payload, row, 'event_eventValue','eventValue')
        self.bind_key(payload, row, 'event_eventTime','eventTime')
        if 'eventTime' in payload:
            # API expects a formatted timestamp string, not a datetime.
            payload['eventTime'] = payload['eventTime'].strftime("%Y-%m-%d %H:%M:%S.%f")
        self.bind_key(payload, row, 'event_eventCurrency','eventCurrency')

        # Run request asynchronously, starting at attempt 1.
        response = await self._send_http_request(session, payload, 1)
        if response == 200:
            success_elements.append(row)
        return response

    async def _send_http_request(self, session, payload, curr_retry):
        """POSTs one event, retrying up to 3 attempts on non-200 or exceptions.

        Back-off is linear: attempt N sleeps N seconds before retrying.
        Returns the HTTP status, or -1 when the request kept raising.
        """
        url = self.API_URL + self.app_id
        headers = {
            "authentication": self.dev_key.get(),
            'Content-Type': 'application/json'
        }

        try:
            async with session.post(url, headers=headers, json=payload,
                                    raise_for_status=False, timeout=15) as response:
                if response.status != 200:
                    if curr_retry < 3:
                        await asyncio.sleep(curr_retry)
                        return await self._send_http_request(session, payload, curr_retry + 1)
                    else:
                        logging.getLogger("megalista.AppsFlyerS2SUploader").error(
                            f"Fail to send event. Response code: {response.status}, "
                            f"reason: {response.reason}")
                        # print(await response.text())  # uncomment to troubleshoot
                return response.status

        except Exception as exc:
            if curr_retry < 3:
                await asyncio.sleep(curr_retry)
                return await self._send_http_request(session, payload, curr_retry + 1)
            else:
                logging.getLogger("megalista.AppsFlyerS2SUploader").error('Error inserting event: ' + str(exc))
                return -1

    async def _async_request_runner(self, elements, success_elements):
        """Fans out one request task per element and awaits them all."""
        tasks = []

        # Create client session to prevent multiple connections.
        async with ClientSession(timeout=self.timeout) as session:

            # For each event
            for element in elements:
                task = asyncio.ensure_future(self._prepare_and_send(session, element, success_elements))
                tasks.append(task)

            return await asyncio.gather(*tasks)

    def bind_key(self, payload, row, row_key, name):
        """Copies row[row_key] into payload[name] when present and non-empty."""
        if row_key in row and row[row_key] is not None and row[row_key] != "":
            payload[name] = row[row_key]

    @utils.safe_process(logger=logging.getLogger("megalista.AppsFlyerS2SUploader"))
    def process(self, batch: Batch, **kwargs):
        success_elements: List[Any] = []
        start_datetime = datetime.now()
        execution = batch.execution

        self.app_id = execution.destination.destination_metadata[0]

        # Send all requests asynchronously. The loop is created per bundle
        # and always closed (the original leaked it), and the coroutine is
        # passed directly to run_until_complete instead of the deprecated
        # ensure_future(..., loop=...) pattern.
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(
                self._async_request_runner(batch.elements, success_elements))
        finally:
            loop.close()

        # Wait to avoid API throttling (rate limitation: 500 per sec).
        delta_sec = (datetime.now() - start_datetime).total_seconds()
        min_duration_sec = len(batch.elements) / 500
        if delta_sec < min_duration_sec:
            time.sleep(min_duration_sec - delta_sec)
        logging.getLogger("megalista.AppsFlyerS2SUploader").info(
            f"Successfully uploaded {len(success_elements)}/{len(batch.elements)} events.")

        yield Batch(execution, success_elements)
144 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/big_query/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/big_query/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/big_query/transactional_events_results_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from datetime import datetime
17 |
18 | import apache_beam as beam
19 | from google.cloud import bigquery
20 | from google.cloud.bigquery import SchemaField
21 |
22 | from uploaders import utils
23 | from models.execution import Batch
24 |
25 |
class TransactionalEventsResultsWriter(beam.DoFn):
    """
    Uploads UUIDs from rows successfully sent by the uploader.
    It uploads the rows to a table with the same name of the source table plus the suffix '_uploaded'.
    """

    def __init__(self, bq_ops_dataset):
        super().__init__()
        # ValueProvider naming the BigQuery dataset that holds results tables.
        self._bq_ops_dataset = bq_ops_dataset

    @utils.safe_process(logger=logging.getLogger("megalista.TransactionalEventsResultsWriter"))
    def process(self, batch: Batch, *args, **kwargs):
        self._do_process(batch, datetime.now().timestamp())

    def _do_process(self, batch: Batch, now):
        """Inserts one (uuid, timestamp) row per batch element into the results table."""
        execution = batch.execution

        # Results land in "<ops_dataset>.<source_table>_uploaded".
        uploaded_table = f"{self._bq_ops_dataset.get()}.{execution.source.source_metadata[1]}_uploaded"

        bq_client = self._get_bq_client()
        table = bq_client.get_table(uploaded_table)
        payload = [{'uuid': element['uuid'], 'timestamp': now} for element in batch.elements]
        schema = (SchemaField("uuid", "string"), SchemaField("timestamp", "timestamp"))
        insert_errors = bq_client.insert_rows(table, payload, schema)

        # insert_rows returns one entry per failed row; surface each failure.
        for insert_error in insert_errors:
            logging.getLogger("megalista.TransactionalEventsResultsWriter").error(insert_error['errors'])

    @staticmethod
    def _get_bq_client():
        # Isolated in a helper so tests can swap in a mock client.
        return bigquery.Client()
57 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/big_query/transactional_events_results_writer_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import datetime
16 |
17 | from models.execution import AccountConfig
18 | from models.execution import Destination
19 | from models.execution import DestinationType
20 | from models.execution import Execution
21 | from models.execution import Source
22 | from models.execution import SourceType
23 | from models.execution import Batch
24 | import pytest
25 | from uploaders.big_query.transactional_events_results_writer import TransactionalEventsResultsWriter
26 |
27 | from google.cloud.bigquery import SchemaField
28 |
29 | from apache_beam.options.value_provider import StaticValueProvider
30 |
31 |
@pytest.fixture
def uploader():
    # Writer pointed at a static ops dataset name.
    dataset = StaticValueProvider(str, 'bq_ops_dataset')
    return TransactionalEventsResultsWriter(dataset)
35 |
36 |
def test_bigquery_write(mocker, uploader):
    """Each element becomes a uuid/timestamp row inserted with the expected schema."""
    mock_client = mocker.MagicMock()
    mocker.patch.object(uploader, "_get_bq_client")
    uploader._get_bq_client.return_value = mock_client

    mock_table = mocker.MagicMock()
    mock_client.get_table.return_value = mock_table

    timestamp = datetime.datetime.now().timestamp()

    account_config = AccountConfig("account_id", False, "ga_account_id", "", "")
    source = Source("orig1", SourceType.BIG_QUERY, ["dt1", "buyers"])
    destination = Destination(
        "dest1",
        DestinationType.GA_MEASUREMENT_PROTOCOL,
        ["web_property", "view", "c", "list", "d", "buyers_custom_dim"])
    execution = Execution(account_config, source, destination)

    batch = Batch(execution, [{"uuid": "uuid-1"}, {"uuid": "uuid-2"}])
    uploader._do_process(batch, timestamp)

    mock_client.insert_rows.assert_called_once_with(
        mock_table,
        [{"uuid": "uuid-1", "timestamp": timestamp},
         {"uuid": "uuid-2", "timestamp": timestamp}],
        (SchemaField("uuid", "string"),
         SchemaField("timestamp", "timestamp")))
64 |
65 |
def test_bigquery_write_failure(mocker, uploader, caplog):
    """Errors returned by insert_rows must be logged."""
    mock_client = mocker.MagicMock()
    mocker.patch.object(uploader, "_get_bq_client")
    uploader._get_bq_client.return_value = mock_client

    error_message = "This is an error message"
    mock_client.insert_rows.return_value = [{"errors": error_message}]

    account_config = AccountConfig("account_id", False, "ga_account_id", "", "")
    source = Source("orig1", SourceType.BIG_QUERY, ["dt1", "buyers"])
    destination = Destination(
        "dest1",
        DestinationType.GA_MEASUREMENT_PROTOCOL,
        ["web_property", "view", "c", "list", "d", "buyers_custom_dim"])
    execution = Execution(account_config, source, destination)

    uploader.process(Batch(execution, [{"uuid": "uuid-1"}]))

    assert error_message in caplog.text
87 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/campaign_manager/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/campaign_manager/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/campaign_manager/campaign_manager_conversion_uploader.py:
--------------------------------------------------------------------------------
1 | """Campaign Manager Conversion Uploader beam module."""
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import logging
17 | import math
18 | import time
19 |
20 | import apache_beam as beam
21 | from google.oauth2.credentials import Credentials
22 | from googleapiclient.discovery import build
23 |
24 | from uploaders import utils
25 | from models.execution import DestinationType, Batch
26 |
27 | _LOGGER_NAME: str = 'megalista.CampaignManagerConversionsUploader'
28 |
29 |
class CampaignManagerConversionUploaderDoFn(beam.DoFn):
    """Beam DoFn that uploads offline conversions to Campaign Manager."""

    def __init__(self, oauth_credentials):
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_dcm_service(self):
        """Builds an authenticated dfareporting v3.4 service client."""
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=[
                'https://www.googleapis.com/auth/dfareporting',
                'https://www.googleapis.com/auth/dfatrafficking',
                'https://www.googleapis.com/auth/ddmconversions'])

        return build('dfareporting', 'v3.4', credentials=credentials)

    def start_bundle(self):
        pass

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Raises ValueError unless both destination metadata entries are non-empty."""
        metadata = any_execution.destination.destination_metadata
        if len(metadata) != 2:
            raise ValueError(
                f'Missing destination information. Found {len(metadata)}')

        if not (metadata[0] and metadata[1]):
            raise ValueError(
                f'Missing destination information. Received {str(metadata)}')

    @utils.safe_process(logger=logging.getLogger(_LOGGER_NAME))
    def process(self, batch: Batch, **kwargs):
        self._do_process(batch, time.time())
        yield batch

    def _do_process(self, batch: Batch, timestamp):
        """Validates the execution, then uploads the batch elements as conversions."""
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        metadata = execution.destination.destination_metadata
        self._do_upload_data(
            metadata[0],
            metadata[1],
            execution.account_config.campaign_manager_account_id,
            timestamp,
            batch.elements)

    def _do_upload_data(
            self,
            floodlight_activity_id,
            floodlight_configuration_id,
            campaign_manager_account_id,
            timestamp,
            rows):
        """Sends a conversions batchinsert request and logs any API failures."""
        logger = logging.getLogger(_LOGGER_NAME)

        conversions = []
        for row in rows:
            conversion = {
                'floodlightActivityId': floodlight_activity_id,
                'floodlightConfigurationId': floodlight_configuration_id,
                # 10e5 == 1e6: seconds scaled to microseconds.
                'ordinal': math.floor(timestamp * 10e5),
                'timestampMicros': math.floor(timestamp * 10e5)
            }

            # First truthy identifier wins, mirroring the accepted id types.
            for identifier in ('gclid', 'encryptedUserId', 'mobileDeviceId', 'matchId'):
                if identifier in row and row[identifier]:
                    conversion[identifier] = row[identifier]
                    break

            conversions.append(conversion)

        request_body = {
            'conversions': conversions,
        }

        logger.info(f'Conversions: \n{conversions}')

        service = self._get_dcm_service()
        request = service.conversions().batchinsert(
            profileId=campaign_manager_account_id, body=request_body)
        response = request.execute()

        if response['hasFailures']:
            logger.error(f'Error(s) inserting conversions:\n{response}')
            error_messages = []
            for status in response['status']:
                if 'errors' in status:
                    for error in status['errors']:
                        error_messages.append('[{}]: {}'.format(error['code'], error['message']))
            logger.error('Errors from API:\n{}'.format('\n'.join(error_messages)))
133 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/campaign_manager/campaign_manager_conversion_uploader_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import math
15 | import time
16 | import logging
17 |
18 | from apache_beam.options.value_provider import StaticValueProvider
19 | from uploaders.campaign_manager.campaign_manager_conversion_uploader import CampaignManagerConversionUploaderDoFn
20 | from models.execution import AccountConfig
21 | from models.execution import Destination
22 | from models.execution import DestinationType
23 | from models.execution import Execution
24 | from models.execution import Source
25 | from models.execution import SourceType
26 | from models.execution import Batch
27 | from models.oauth_credentials import OAuthCredentials
28 | import pytest
29 |
# Shared fixture data: only the Campaign Manager profile id matters for these
# tests; the other account ids are intentionally blank.
_account_config = AccountConfig(mcc=False,
                                campaign_manager_account_id='dcm_profile_id',
                                google_ads_account_id='',
                                google_analytics_account_id='',
                                app_id='')
35 |
36 |
@pytest.fixture
def uploader(mocker):
    # Static (never refreshed) OAuth values are enough for these tests.
    credentials = OAuthCredentials(
        StaticValueProvider(str, 'id'),
        StaticValueProvider(str, 'secret'),
        StaticValueProvider(str, 'access'),
        StaticValueProvider(str, 'refresh'))
    return CampaignManagerConversionUploaderDoFn(credentials)
46 |
47 |
def test_get_service(uploader):
    """The service builder must return a usable client object."""
    service = uploader._get_dcm_service()
    assert service is not None
50 |
51 |
def test_conversion_upload(mocker, uploader):
    """Each gclid row becomes one conversion carrying the floodlight ids and timestamps."""
    mocker.patch.object(uploader, '_get_dcm_service')

    floodlight_activity_id = 'floodlight_activity_id'
    floodlight_configuration_id = 'floodlight_configuration_id'

    source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
    destination = Destination(
        'dest1',
        DestinationType.CM_OFFLINE_CONVERSION,
        (floodlight_activity_id, floodlight_configuration_id))

    execution = Execution(_account_config, source, destination)

    current_time = time.time()

    uploader._do_process(Batch(execution, [{
        'gclid': '123'
    }, {
        'gclid': '456'
    }]), current_time)

    # ordinal/timestampMicros mirror the uploader's 10e5 (== 1e6) scaling.
    expected_body = {
        'conversions': [{
            'gclid': '123',
            'floodlightActivityId': floodlight_activity_id,
            'floodlightConfigurationId': floodlight_configuration_id,
            'ordinal': math.floor(current_time * 10e5),
            'timestampMicros': math.floor(current_time * 10e5)
        }, {
            'gclid': '456',
            'floodlightActivityId': floodlight_activity_id,
            'floodlightConfigurationId': floodlight_configuration_id,
            'ordinal': math.floor(current_time * 10e5),
            'timestampMicros': math.floor(current_time * 10e5)
        }],
    }

    uploader._get_dcm_service().conversions().batchinsert.assert_any_call(
        profileId='dcm_profile_id', body=expected_body)
92 |
93 |
def test_conversion_upload_match_id(mocker, uploader):
    """A matchId row is uploaded using the matchId identifier field."""
    mocker.patch.object(uploader, '_get_dcm_service')

    floodlight_activity_id = 'floodlight_activity_id'
    floodlight_configuration_id = 'floodlight_configuration_id'

    source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
    destination = Destination(
        'dest1',
        DestinationType.CM_OFFLINE_CONVERSION,
        (floodlight_activity_id, floodlight_configuration_id))
    execution = Execution(_account_config, source, destination)
    current_time = time.time()

    # Freeze time.time() so the uploader's timestamps match expected_body.
    mocker.patch.object(time, 'time')
    time.time.return_value = current_time

    uploader._do_process(Batch(execution, [{'matchId': 'abc'}]), current_time)

    expected_body = {
        'conversions': [{
            'matchId': 'abc',
            'floodlightActivityId': floodlight_activity_id,
            'floodlightConfigurationId': floodlight_configuration_id,
            'ordinal': math.floor(current_time * 10e5),
            'timestampMicros': math.floor(current_time * 10e5)
        }],
    }

    uploader._get_dcm_service().conversions().batchinsert.assert_any_call(
        profileId='dcm_profile_id', body=expected_body)
125 |
126 |
def test_error_on_api_call(mocker, uploader, caplog):
    """API failures must surface both the raw response and per-error messages in logs."""
    caplog.set_level(logging.INFO, 'megalista.CampaignManagerConversionsUploader')
    mocker.patch.object(uploader, '_get_dcm_service')
    service = mocker.MagicMock()
    uploader._get_dcm_service.return_value = service

    # Simulate a batchinsert response reporting one failed conversion.
    service.conversions().batchinsert().execute.return_value = {
        'hasFailures': True,
        'status': [{
            'errors': [{
                'code': '123',
                'message': 'error_returned'
            }]
        }]
    }

    source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
    destination = Destination(
        'dest1', DestinationType.CM_OFFLINE_CONVERSION, ['a', 'b'])
    execution = Execution(_account_config, source, destination)

    uploader._do_process(Batch(execution, [{'gclid': '123'}]), time.time())

    assert 'Error(s) inserting conversions:' in caplog.text
    assert '[123]: error_returned' in caplog.text
152 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/google_ads/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/conversions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/google_ads/conversions/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/conversions/google_ads_offline_conversions_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | import apache_beam as beam
18 | from uploaders import utils
19 | from models.execution import Batch, DestinationType, Execution
20 |
21 |
class GoogleAdsOfflineUploaderDoFn(beam.DoFn):
    """Beam DoFn that uploads offline click conversions to Google Ads.

    Uses the AdWords OfflineConversionFeedService (v201809). The DoFn is
    inactive — it skips uploads — when no developer token was configured.
    """

    def __init__(self, oauth_credentials, developer_token):
        """
        Args:
          oauth_credentials: OAuthCredentials used to build the Ads client.
          developer_token: ValueProvider with the Ads developer token, or
            None to disable uploads.
        """
        super().__init__()
        self.oauth_credentials = oauth_credentials
        self.developer_token = developer_token
        self.active = self.developer_token is not None

    def _get_oc_service(self, customer_id):
        """Builds the OfflineConversionFeedService client for customer_id."""
        return utils.get_ads_service('OfflineConversionFeedService', 'v201809',
                                     self.oauth_credentials,
                                     self.developer_token.get(), customer_id)

    def start_bundle(self):
        pass

    @staticmethod
    def _assert_conversion_name_is_present(execution: Execution):
        """Raises ValueError unless destination metadata is exactly [conversion_name]."""
        destination = execution.destination.destination_metadata
        if len(destination) != 1:
            raise ValueError('Missing destination information. Found {}'.format(
                len(destination)))

        if not destination[0]:
            raise ValueError('Missing destination information. Received {}'.format(
                str(destination)))

    @utils.safe_process(
        logger=logging.getLogger('megalista.GoogleAdsOfflineUploader'))
    def process(self, batch: Batch, **kwargs):
        if not self.active:
            # Named logger (was the root logger) so messages are attributable
            # to this uploader, consistent with the safe_process logger above.
            logging.getLogger('megalista.GoogleAdsOfflineUploader').warning(
                'Skipping upload, parameters not configured.')
            return
        execution = batch.execution
        self._assert_conversion_name_is_present(execution)

        oc_service = self._get_oc_service(
            execution.account_config.google_ads_account_id)

        self._do_upload(oc_service,
                        execution.destination.destination_metadata[0],
                        batch.elements)

    @staticmethod
    def _do_upload(oc_service, conversion_name, rows):
        """Converts rows to ADD mutate operations and sends them to the API."""
        logging.getLogger('megalista.GoogleAdsOfflineUploader').warning(
            'Uploading {} rows to Google Ads'.format(len(rows)))
        upload_data = [{
            'operator': 'ADD',
            'operand': {
                'conversionName': conversion_name,
                'conversionTime': utils.format_date(conversion['time']),
                'conversionValue': conversion['amount'],
                'googleClickId': conversion['gclid']
            }
        } for conversion in rows]

        oc_service.mutate(upload_data)
81 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/conversions/google_ads_offline_conversions_uploader_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from apache_beam.options.value_provider import StaticValueProvider
16 | import pytest
17 | from uploaders.google_ads.conversions.google_ads_offline_conversions_uploader import GoogleAdsOfflineUploaderDoFn
18 | from models.execution import AccountConfig
19 | from models.execution import Destination
20 | from models.execution import DestinationType
21 | from models.execution import Execution
22 | from models.execution import Source
23 | from models.execution import SourceType
24 | from models.execution import Batch
25 | from models.oauth_credentials import OAuthCredentials
26 |
# Shared fixture data: Ads account id 'account_id', non-MCC, GA account id;
# the Campaign Manager id and app id are unused by these tests.
_account_config = AccountConfig('account_id', False, 'ga_account_id', '', '')
28 |
29 |
@pytest.fixture
def uploader(mocker):
    # Stub out the googleads clients so no real OAuth flow happens.
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    credentials = OAuthCredentials(
        StaticValueProvider(str, 'id'),
        StaticValueProvider(str, 'secret'),
        StaticValueProvider(str, 'access'),
        StaticValueProvider(str, 'refresh'))
    dev_token = StaticValueProvider(str, 'devtoken')
    return GoogleAdsOfflineUploaderDoFn(credentials, dev_token)
41 |
42 |
def test_get_service(mocker, uploader):
    """The service builder must return a usable client object."""
    oc_service = uploader._get_oc_service(mocker.ANY)
    assert oc_service is not None
45 |
46 |
def test_not_active(mocker, caplog):
    """With no developer token the uploader logs a warning and skips the upload."""
    credentials = OAuthCredentials(
        StaticValueProvider(str, 'id'),
        StaticValueProvider(str, 'secret'),
        StaticValueProvider(str, 'access'),
        StaticValueProvider(str, 'refresh'))
    uploader_dofn = GoogleAdsOfflineUploaderDoFn(credentials, None)
    mocker.patch.object(uploader_dofn, '_get_oc_service')

    uploader_dofn.process(Batch(None, []))

    uploader_dofn._get_oc_service.assert_not_called()
    assert 'Skipping upload, parameters not configured.' in caplog.text
58 |
59 |
def test_conversion_upload(mocker, uploader):
    """Rows are mapped to ADD mutate operations with formatted conversion times."""
    mocker.patch.object(uploader, '_get_oc_service')
    conversion_name = 'user_list'
    destination = Destination(
        'dest1', DestinationType.ADS_OFFLINE_CONVERSION, ['user_list'])
    source = Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers'])
    execution = Execution(_account_config, source, destination)

    # Expected values mirror utils.format_date's output format.
    time1 = '2020-04-09T14:13:55.0005'
    time1_result = '20200409 141355 America/Sao_Paulo'

    time2 = '2020-04-09T13:13:55.0005'
    time2_result = '20200409 131355 America/Sao_Paulo'

    batch = Batch(execution, [{
        'time': time1,
        'amount': '123',
        'gclid': '456'
    },{
        'time': time2,
        'amount': '234',
        'gclid': '567'
    }])

    uploader.process(batch)

    uploader._get_oc_service.return_value.mutate.assert_any_call([{
        'operator': 'ADD',
        'operand': {
            'conversionName': conversion_name,
            'conversionTime': time1_result,
            'conversionValue': '123',
            'googleClickId': '456'
        }
    }, {
        'operator': 'ADD',
        'operand': {
            'conversionName': conversion_name,
            'conversionTime': time2_result,
            'conversionValue': '234',
            'googleClickId': '567'
        }
    }])
103 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/conversions/google_ads_ssd_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import apache_beam as beam
16 | import logging
17 |
18 | from uploaders import utils
19 | from models.execution import DestinationType, Batch, Execution
20 |
21 |
class GoogleAdsSSDUploaderDoFn(beam.DoFn):
  """Uploads Store Sales Direct (SSD) transactions to Google Ads.

  Sends each batch of hashed-email transactions through the AdWords
  OfflineDataUploadService (v201809) as a single first-party store-sales
  upload.
  """

  def __init__(self, oauth_credentials, developer_token):
    super().__init__()
    self.oauth_credentials = oauth_credentials
    self.developer_token = developer_token
    # Uploads are disabled when no developer token was provided.
    self.active = developer_token is not None

  def _get_ssd_service(self, customer_id):
    # Builds the AdWords service bound to the target customer account.
    return utils.get_ads_service('OfflineDataUploadService', 'v201809',
                                 self.oauth_credentials,
                                 self.developer_token.get(), customer_id)

  @staticmethod
  def _assert_conversion_metadata_is_present(execution: Execution):
    """Raises ValueError unless metadata is [conversion_name, upload_id]."""
    metadata = execution.destination.destination_metadata
    if len(metadata) != 2:
      raise ValueError(
          f'Missing destination information. Received {len(metadata)} entry(ies)')

  @utils.safe_process(
      logger=logging.getLogger('megalista.GoogleAdsSSDUploader'))
  def process(self, batch: Batch, **kwargs):
    # Guard on self.active (previously computed but never checked):
    # without it a missing developer token crashes on developer_token.get()
    # instead of skipping like the other Ads uploaders do.
    if not self.active:
      logging.getLogger('megalista.GoogleAdsSSDUploader').warning(
          'Skipping upload, parameters not configured.')
      return

    execution = batch.execution
    self._assert_conversion_metadata_is_present(execution)

    # Use the public accessor, consistent with the other uploaders
    # (the private _google_ads_account_id was accessed here before).
    ssd_service = self._get_ssd_service(
        execution.account_config.google_ads_account_id)
    self._do_upload(ssd_service,
                    execution.destination.destination_metadata[0],
                    execution.destination.destination_metadata[1], batch.elements)

  @staticmethod
  def _do_upload(ssd_service, conversion_name, ssd_external_upload_id, rows):
    """Mutates one STORE_SALES_UPLOAD_FIRST_PARTY operation with all rows."""
    upload_data = [{
        'StoreSalesTransaction': {
            'userIdentifiers': [{
                'userIdentifierType': 'HASHED_EMAIL',
                'value': conversion['hashedEmail']
            }],
            'transactionTime': utils.format_date(conversion['time']),
            'transactionAmount': {
                'currencyCode': 'BRL',
                'money': {
                    'microAmount': conversion['amount']
                }
            },
            'conversionName': conversion_name
        }
    } for conversion in rows]

    offline_data_upload = {
        'externalUploadId': ssd_external_upload_id,
        'offlineDataList': upload_data,
        'uploadType': 'STORE_SALES_UPLOAD_FIRST_PARTY',
        'uploadMetadata': {
            'StoreSalesUploadCommonMetadata': {
                'xsi_type': 'FirstPartyUploadMetadata',
                'loyaltyRate': 1.0,
                'transactionUploadRate': 1.0,
            }
        }
    }

    add_conversions_operation = {
        'operand': offline_data_upload,
        'operator': 'ADD'
    }
    ssd_service.mutate([add_conversions_operation])
91 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/conversions/google_ads_ssd_uploader_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 | from apache_beam.options.value_provider import StaticValueProvider
17 |
18 | from uploaders.google_ads.conversions.google_ads_ssd_uploader import GoogleAdsSSDUploaderDoFn
19 | from models.execution import AccountConfig
20 | from models.execution import Destination
21 | from models.execution import DestinationType
22 | from models.execution import Execution
23 | from models.execution import Source
24 | from models.execution import SourceType
25 | from models.execution import Batch
26 | from models.oauth_credentials import OAuthCredentials
27 |
28 | _account_config = AccountConfig('account_id', False, 'ga_account_id', '', '')
29 |
30 |
@pytest.fixture
def uploader(mocker):
  """Builds a GoogleAdsSSDUploaderDoFn with mocked Google Ads clients."""
  mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
  mocker.patch('googleads.adwords.AdWordsClient')
  # Named credential_id (not 'id') to avoid shadowing the builtin and to
  # match the sibling offline-conversions test fixture.
  credential_id = StaticValueProvider(str, 'id')
  secret = StaticValueProvider(str, 'secret')
  access = StaticValueProvider(str, 'access')
  refresh = StaticValueProvider(str, 'refresh')
  credentials = OAuthCredentials(credential_id, secret, access, refresh)
  return GoogleAdsSSDUploaderDoFn(credentials,
                                  StaticValueProvider(str, 'devtoken'))
42 |
43 |
def test_get_service(mocker, uploader):
  """The SSD service must always be constructed."""
  ssd_service = uploader._get_ssd_service(mocker.ANY)
  assert ssd_service is not None
46 |
47 |
def test_fail_missing_destination_metadata(uploader, mocker):
  """A destination with too few metadata entries must abort the upload."""
  mocker.patch.object(uploader, '_get_ssd_service')
  execution = Execution(
      _account_config,
      Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers')),
      Destination('dest1', DestinationType.ADS_SSD_UPLOAD, ['1']))
  uploader.process(Batch(execution, []))
  uploader._get_ssd_service.assert_not_called()
56 |
57 |
def test_conversion_upload(mocker, uploader):
  """Batched transactions should be sent as one SSD mutate operation."""
  mocker.patch.object(uploader, '_get_ssd_service')
  conversion_name = 'ssd_conversion'
  external_upload_id = '123'
  source = Source('orig1', SourceType.BIG_QUERY, ('dt1', 'buyers'))
  destination = Destination('dest1', DestinationType.ADS_SSD_UPLOAD,
                            [conversion_name, external_upload_id])
  execution = Execution(_account_config, source, destination)

  # (email, raw time, expected formatted time, amount) per transaction.
  transactions = [
      ('a@a.com', '2020-04-09T14:13:55.0005',
       '20200409 141355 America/Sao_Paulo', '123'),
      ('b@b.com', '2020-04-09T13:13:55.0005',
       '20200409 131355 America/Sao_Paulo', '234'),
  ]

  elements = [{
      'hashedEmail': email,
      'time': raw_time,
      'amount': amount
  } for email, raw_time, _, amount in transactions]

  uploader.process(Batch(execution, elements))

  upload_data = [{
      'StoreSalesTransaction': {
          'userIdentifiers': [{
              'userIdentifierType': 'HASHED_EMAIL',
              'value': email
          }],
          'transactionTime': formatted_time,
          'transactionAmount': {
              'currencyCode': 'BRL',
              'money': {
                  'microAmount': amount
              }
          },
          'conversionName': conversion_name
      }
  } for email, _, formatted_time, amount in transactions]

  uploader._get_ssd_service.return_value.mutate.assert_any_call([{
      'operand': {
          'externalUploadId': external_upload_id,
          'offlineDataList': upload_data,
          'uploadType': 'STORE_SALES_UPLOAD_FIRST_PARTY',
          'uploadMetadata': {
              'StoreSalesUploadCommonMetadata': {
                  'xsi_type': 'FirstPartyUploadMetadata',
                  'loyaltyRate': 1.0,
                  'transactionUploadRate': 1.0,
              }
          }
      },
      'operator': 'ADD'
  }])
132 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/customer_match/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name = "google_ads_customer_match"
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/customer_match/abstract_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from typing import Dict, Any, List
17 |
18 | import apache_beam as beam
19 | from uploaders import utils
20 | from models.execution import AccountConfig
21 | from models.execution import DestinationType
22 | from models.execution import Batch
23 | from models.oauth_credentials import OAuthCredentials
24 |
25 | _DEFAULT_LOGGER: str = 'megalista.GoogleAdsCustomerMatchAbstractUploader'
26 |
27 |
class GoogleAdsCustomerMatchAbstractUploaderDoFn(beam.DoFn):
  """Base DoFn for uploading Customer Match user lists to Google Ads.

  Subclasses supply the list definition, the row keys to upload and the
  destination type they handle (contact info, mobile device id, CRM id).
  """

  def __init__(self, oauth_credentials: OAuthCredentials, developer_token: str):
    super().__init__()
    self.oauth_credentials = oauth_credentials
    self.developer_token = developer_token
    # Uploads are disabled when no developer token was provided
    # (one-line form, consistent with the other Ads uploaders).
    self.active = developer_token is not None
    # Caches list name -> list id so each list is looked up/created once
    # per DoFn instance.
    self._user_list_id_cache: Dict[str, str] = {}

  def start_bundle(self):
    pass

  def _create_list_if_it_does_not_exist(self, user_list_service, list_name: str,
                                        list_definition: Dict[str, Any]) -> str:
    """Returns the id for list_name, creating the list on first use."""
    if self._user_list_id_cache.get(list_name) is None:
      self._user_list_id_cache[list_name] = \
          self._do_create_list_if_it_does_not_exist(
              user_list_service, list_name, list_definition)

    return self._user_list_id_cache[list_name]

  def _do_create_list_if_it_does_not_exist(self, user_list_service,
                                           list_name: str,
                                           list_definition: Dict[str, Any]
                                           ) -> str:
    """Looks a user list up by exact name and creates it when absent."""
    response = user_list_service.get([{
        'fields': ['Id', 'Name'],
        'predicates': [{
            'field': 'Name',
            'operator': 'EQUALS',
            'values': [list_name]
        }]
    }])

    if not response.entries:
      logging.getLogger(_DEFAULT_LOGGER).info(
          '%s list does not exist, creating...', list_name)
      result = user_list_service.mutate([{
          'operator': 'ADD',
          **list_definition
      }])
      list_id = result['value'][0]['id']
      logging.getLogger(_DEFAULT_LOGGER).info('List %s created with id: %d',
                                              list_name, list_id)
    else:
      list_id = response.entries[0]['id']
      logging.getLogger(_DEFAULT_LOGGER).info('List found %s with id: %d',
                                              list_name, list_id)

    return str(list_id)

  # just to facilitate mocking
  def _get_user_list_service(self, customer_id):
    return utils.get_ads_service('AdwordsUserListService', 'v201809',
                                 self.oauth_credentials,
                                 self.developer_token.get(), customer_id)

  def _assert_execution_is_valid(self, execution) -> None:
    """Raises ValueError when the destination lacks the user-list name."""
    destination = execution.destination.destination_metadata

    # The number of parameters vary by upload. This test could be parameterized
    if not destination[0]:
      raise ValueError('Missing destination information. Received {}'.format(
          str(destination)))

  @utils.safe_process(logger=logging.getLogger(_DEFAULT_LOGGER))
  def process(self, batch: Batch, **kwargs) -> None:
    """Uploads one batch of rows to the (possibly newly created) list."""
    if not self.active:
      logging.getLogger(_DEFAULT_LOGGER).warning(
          'Skipping upload to ads, parameters not configured.')
      return

    execution = batch.execution

    self._assert_execution_is_valid(execution)

    user_list_service = self._get_user_list_service(
        execution.account_config.google_ads_account_id)
    list_id = self._create_list_if_it_does_not_exist(
        user_list_service, execution.destination.destination_metadata[0],
        self.get_list_definition(
            execution.account_config,
            execution.destination.destination_metadata))

    rows = self.get_filtered_rows(
        batch.elements, self.get_row_keys())

    # destination_metadata[1] is used directly as the mutate operator.
    mutate_members_operation = {
        'operand': {
            'userListId': list_id,
            'membersList': rows
        },
        'operator': execution.destination.destination_metadata[1]
    }

    utils.safe_call_api(self.call_api, logging, user_list_service, [mutate_members_operation])

  def call_api(self, service, operations):
    service.mutateMembers(operations)

  def get_filtered_rows(self, rows: List[Any],
                        keys: List[str]) -> List[Dict[str, Any]]:
    """Projects each row onto the given keys (missing keys become None)."""
    return [{key: row.get(key) for key in keys} for row in rows]

  def get_list_definition(self, account_config: AccountConfig,
                          destination_metadata: List[str]) -> Dict[str, Any]:
    """Subclass hook: the AdWords user-list creation operation."""
    pass

  def get_row_keys(self) -> List[str]:
    """Subclass hook: the row keys to include in the upload."""
    pass

  def get_action_type(self) -> DestinationType:
    """Subclass hook: the DestinationType handled by this uploader."""
    pass
144 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/customer_match/contact_info_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import apache_beam as beam
16 | import logging
17 |
18 | from typing import Dict, Any, List
19 |
20 | from uploaders.google_ads.customer_match.abstract_uploader import GoogleAdsCustomerMatchAbstractUploaderDoFn
21 | from uploaders import utils
22 | from models.execution import DestinationType, AccountConfig
23 |
24 |
class GoogleAdsCustomerMatchContactInfoUploaderDoFn(GoogleAdsCustomerMatchAbstractUploaderDoFn):
  """Customer Match uploader keyed by hashed contact information."""

  def get_list_definition(self, account_config: AccountConfig, destination_metadata: List[str]) -> Dict[str, Any]:
    list_name = destination_metadata[0]
    operand = {
        'xsi_type': 'CrmBasedUserList',
        'name': list_name,
        'description': list_name,
        # CRM-based user lists can use a membershipLifeSpan of 10000 to
        # indicate unlimited; otherwise normal values apply.
        'membershipLifeSpan': 10000,
        'uploadKeyType': 'CONTACT_INFO'
    }
    return {'operand': operand}

  def get_row_keys(self) -> List[str]:
    return ['hashedEmail', 'addressInfo', 'hashedPhoneNumber']

  def get_action_type(self) -> DestinationType:
    return DestinationType.ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD
45 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/customer_match/mobile_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import apache_beam as beam
16 | import logging
17 |
18 | from typing import List, Dict, Any
19 |
20 | from uploaders.google_ads.customer_match.abstract_uploader import GoogleAdsCustomerMatchAbstractUploaderDoFn
21 | from uploaders import utils as utils
22 | from models.execution import DestinationType, AccountConfig
23 | from models.oauth_credentials import OAuthCredentials
24 |
25 |
class GoogleAdsCustomerMatchMobileUploaderDoFn(GoogleAdsCustomerMatchAbstractUploaderDoFn):
  """Customer Match uploader keyed by mobile advertising ids."""

  def get_list_definition(self, account_config: AccountConfig, destination_metadata: List[str]) -> Dict[str, Any]:
    list_name = destination_metadata[0]

    # Prefer a non-empty custom app id from the destination metadata,
    # falling back to the account-level default.
    custom_app_id = destination_metadata[3] if len(destination_metadata) >= 4 else ''
    app_id = custom_app_id if len(custom_app_id) > 0 else account_config.app_id

    operand = {
        'xsi_type': 'CrmBasedUserList',
        'name': list_name,
        'description': list_name,
        # CRM-based user lists can use a membershipLifeSpan of 10000 to
        # indicate unlimited; otherwise normal values apply.
        'membershipLifeSpan': 10000,
        'appId': app_id,
        'uploadKeyType': 'MOBILE_ADVERTISING_ID'
    }
    return {'operand': operand}

  def get_row_keys(self) -> List[str]:
    return ['mobileId']

  def get_action_type(self) -> DestinationType:
    return DestinationType.ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD
53 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_ads/customer_match/user_id_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import Dict, Any, List
15 |
16 | from uploaders import utils
17 | from uploaders.google_ads.customer_match.abstract_uploader import GoogleAdsCustomerMatchAbstractUploaderDoFn
18 | from models.execution import DestinationType, AccountConfig
19 |
20 |
class GoogleAdsCustomerMatchUserIdUploaderDoFn(
    GoogleAdsCustomerMatchAbstractUploaderDoFn):
  """Customer Match uploader keyed by CRM user ids."""

  def get_list_definition(
      self,
      account_config: AccountConfig,
      destination_metadata: List[str]) -> Dict[str, Any]:
    list_name = destination_metadata[0]
    operand = {
        'xsi_type': 'CrmBasedUserList',
        'name': list_name,
        'description': list_name,
        # CRM-based user lists can use a membershipLifeSpan of 10000 to
        # indicate unlimited; otherwise normal values apply.
        'membershipLifeSpan': 10000,
        'uploadKeyType': 'CRM_ID'
    }
    return {'operand': operand}

  def get_row_keys(self) -> List[str]:
    return ['userId']

  def get_action_type(self) -> DestinationType:
    return DestinationType.ADS_CUSTOMER_MATCH_USER_ID_UPLOAD
46 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DP6/marketing-data-sync/e65fd6627612281143f0696461bd6475e793785d/megalist_dataflow/uploaders/google_analytics/__init__.py
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_4_measurement_protocol.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import logging
17 | from typing import Dict, Any
18 | from urllib.parse import quote
19 |
20 | import apache_beam as beam
21 | import requests
22 | import json
23 |
24 | from uploaders import utils
25 | from models.execution import DestinationType, Batch
26 |
27 |
class GoogleAnalytics4MeasurementProtocolUploaderDoFn(beam.DoFn):
  """Uploads events or user properties via the GA4 Measurement Protocol.

  Destination metadata layout (indices read below):
    [0] api_secret, [1] is_event flag, [2] is_user_property flag,
    [3] non_personalized_ads flag, [4] firebase_app_id (apps, optional),
    [5] measurement_id (web, optional).
  Exactly one of [4]/[5] and exactly one of [1]/[2] must be set.
  """

  def __init__(self):
    super().__init__()
    self.API_URL = 'https://www.google-analytics.com/mp/collect'

  def start_bundle(self):
    pass

  @staticmethod
  def _str2bool(s: str) -> bool:
    # Only the exact (case-insensitive) string 'true' is truthy.
    return s.lower() == 'true'

  @staticmethod
  def _exactly_one_of(a: Any, b: Any) -> bool:
    """Returns True when exactly one of the two values is truthy."""
    return (a and not b) or (not a and b)

  @utils.safe_process(logger=logging.getLogger('megalista.GoogleAnalytics4MeasurementProtocolUploader'))
  def process(self, batch: Batch, **kwargs):
    execution = batch.execution

    api_secret = execution.destination.destination_metadata[0]
    is_event = self._str2bool(execution.destination.destination_metadata[1])
    is_user_property = self._str2bool(execution.destination.destination_metadata[2])
    non_personalized_ads = self._str2bool(execution.destination.destination_metadata[3])

    firebase_app_id = None
    if len(execution.destination.destination_metadata) >= 5:
      firebase_app_id = execution.destination.destination_metadata[4]

    measurement_id = None
    if len(execution.destination.destination_metadata) >= 6:
      measurement_id = execution.destination.destination_metadata[5]

    if not self._exactly_one_of(firebase_app_id, measurement_id):
      raise ValueError(
          'GA4 MP should be called either with a firebase_app_id (for apps) or a measurement_id (for web)')

    if not self._exactly_one_of(is_event, is_user_property):
      raise ValueError(
          'GA4 MP should be called either for sending events or a user properties')

    accepted_elements = []

    for row in batch.elements:
      # Build a fresh payload per row: the previous implementation reused a
      # single dict across iterations, so row-specific keys (e.g. user_id)
      # leaked into subsequent requests.
      payload: Dict[str, Any] = {
          'nonPersonalizedAds': non_personalized_ads
      }

      app_instance_id = row.get('app_instance_id')
      client_id = row.get('client_id')
      user_id = row.get('user_id')

      if not self._exactly_one_of(app_instance_id, client_id):
        raise ValueError(
            'GA4 MP should be called either with an app_instance_id (for apps) or a client_id (for web)')

      if is_event:
        params = {k: v for k, v in row.items() if k not in ('name', 'app_instance_id', 'client_id', 'uuid', 'user_id')}
        payload['events'] = [{'name': row['name'], 'params': params}]

      if is_user_property:
        payload['userProperties'] = {k: {'value': v} for k, v in row.items() if k not in ('app_instance_id', 'client_id', 'uuid', 'user_id')}
        # 'events' must be a JSON array per the GA4 MP spec (the previous
        # code assigned a bare dict here, unlike the event branch above).
        payload['events'] = [{'name': 'user_property_addition_event', 'params': {}}]

      url_container = [f'{self.API_URL}?api_secret={api_secret}']

      if firebase_app_id:
        url_container.append(f'&firebase_app_id={firebase_app_id}')
        if not app_instance_id:
          raise ValueError(
              'GA4 MP needs an app_instance_id parameter when used for an App Stream.')
        payload['app_instance_id'] = app_instance_id

      if measurement_id:
        url_container.append(f'&measurement_id={measurement_id}')
        if not client_id:
          raise ValueError(
              'GA4 MP needs a client_id parameter when used for a Web Stream.')
        payload['client_id'] = client_id

      if user_id:
        payload['user_id'] = user_id

      url = ''.join(url_container)
      response = requests.post(url, data=json.dumps(payload))
      if response.status_code != 204:
        # response.text holds the decoded body; .raw only printed the
        # repr of the underlying urllib3 object.
        logging.getLogger('megalista.GoogleAnalytics4MeasurementProtocolUploader').error(
            f'Error calling GA4 MP {response.status_code}: {response.text}')
      else:
        accepted_elements.append(row)

    logging.getLogger('megalista.GoogleAnalytics4MeasurementProtocolUploader').info(
        f'Successfully uploaded {len(accepted_elements)}/{len(batch.elements)} events.')
    yield Batch(execution, accepted_elements)
122 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_4_measurement_protocol_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 | from apache_beam.options.value_provider import StaticValueProvider
17 |
18 | from uploaders.google_analytics.google_analytics_4_measurement_protocol import GoogleAnalytics4MeasurementProtocolUploaderDoFn
19 | from models.execution import Execution, SourceType, DestinationType, Source, AccountConfig, Destination, Batch
20 |
21 | import requests
22 | import requests_mock
23 |
24 | from unittest import mock
25 |
26 |
27 | _account_config = AccountConfig('account_id', False, 'ga_account_id', '', '')
28 |
29 |
@pytest.fixture
def uploader():
  """A bare GA4 Measurement Protocol uploader under test."""
  return GoogleAnalytics4MeasurementProtocolUploaderDoFn()
33 |
34 |
def test_exception_event_and_user_property(uploader, caplog):
  """Flagging both events and user properties must raise a ValueError."""
  with requests_mock.Mocker() as m:
    m.post(requests_mock.ANY, status_code=204)
    metadata = ['api_secret', 'True', 'True', '', 'some_id', '']
    destination = Destination(
        'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, metadata)
    execution = Execution(_account_config,
                          Source('orig1', SourceType.BIG_QUERY, []),
                          destination)
    with pytest.raises(ValueError, match='GA4 MP should be called either for sending events'):
      next(uploader.process(Batch(execution, [])))
51 |
52 |
def test_exception_no_event_nor_user_property(uploader, caplog):
  """Flagging neither events nor user properties must raise a ValueError."""
  with requests_mock.Mocker() as m:
    m.post(requests_mock.ANY, status_code=204)
    metadata = ['api_secret', 'False', 'False', '', 'some_id', '']
    destination = Destination(
        'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, metadata)
    execution = Execution(_account_config,
                          Source('orig1', SourceType.BIG_QUERY, []),
                          destination)
    with pytest.raises(ValueError, match='GA4 MP should be called either for sending events'):
      next(uploader.process(Batch(execution, [])))
69 |
70 |
def test_exception_app_and_web(uploader, caplog):
  """Providing both an app id and a web id must raise a ValueError."""
  with requests_mock.Mocker() as m:
    m.post(requests_mock.ANY, status_code=204)
    metadata = ['api_secret', 'False', 'True', '', 'some_app_id', 'some_web_id']
    destination = Destination(
        'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, metadata)
    execution = Execution(_account_config,
                          Source('orig1', SourceType.BIG_QUERY, []),
                          destination)
    with pytest.raises(ValueError, match='GA4 MP should be called either with a firebase_app_id'):
      next(uploader.process(Batch(execution, [{
          'name': 'event_name',
      }])))
89 |
90 |
def test_exception_no_id(uploader, caplog):
  """Providing neither an app id nor a web id must raise a ValueError."""
  with requests_mock.Mocker() as m:
    m.post(requests_mock.ANY, status_code=204)
    metadata = ['api_secret', 'False', 'True', '', '', '']
    destination = Destination(
        'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, metadata)
    execution = Execution(_account_config,
                          Source('orig1', SourceType.BIG_QUERY, []),
                          destination)
    with pytest.raises(ValueError, match='GA4 MP should be called either with a firebase_app_id'):
      next(uploader.process(Batch(execution, [{
          'name': 'event_name',
          'value': '123'
      }])))
110 |
def test_exception_app_event_without_app_instance_id(uploader, caplog):
  """An App Stream row lacking app_instance_id must raise a ValueError."""
  with requests_mock.Mocker() as m:
    m.post(requests_mock.ANY, status_code=204)
    metadata = ['api_secret', 'True', 'False', '', 'some_id', '']
    destination = Destination(
        'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, metadata)
    execution = Execution(_account_config,
                          Source('orig1', SourceType.BIG_QUERY, []),
                          destination)
    with pytest.raises(ValueError, match='GA4 MP needs an app_instance_id parameter when used for an App Stream.'):
      next(uploader.process(Batch(execution, [{
          'client_id': '123',
          'name': 'event_name',
          'value': '42',
          'important_event': 'False'
      }])))
132 |
def test_exception_web_event_without_client_id(uploader, caplog):
  """A Web Stream row lacking client_id must raise a ValueError."""
  with requests_mock.Mocker() as m:
    m.post(requests_mock.ANY, status_code=204)
    metadata = ['api_secret', 'True', 'False', '', '', 'some_id']
    destination = Destination(
        'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL, metadata)
    execution = Execution(_account_config,
                          Source('orig1', SourceType.BIG_QUERY, []),
                          destination)
    with pytest.raises(ValueError, match='GA4 MP needs a client_id parameter when used for a Web Stream.'):
      next(uploader.process(Batch(execution, [{
          'app_instance_id': '123',
          'name': 'event_name',
          'value': '42',
          'important_event': 'False'
      }])))
154 |
def test_succesful_app_event_call(uploader, caplog):
    """A valid app-stream event is posted exactly once with its value preserved."""
    with requests_mock.Mocker() as request_mocker:
        request_mocker.post(requests_mock.ANY, status_code=204)
        dest = Destination(
            'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL,
            ['api_secret', 'True', 'False', '', 'some_id', ''])
        src = Source('orig1', SourceType.BIG_QUERY, [])
        event = {
            'app_instance_id': '123',
            'name': 'event_name',
            'value': '42',
            'important_event': 'False'
        }
        next(uploader.process(
            Batch(Execution(_account_config, src, dest), [event])))

        assert request_mocker.call_count == 1
        sent_payload = request_mocker.last_request.json()
        assert sent_payload['events'][0]['params']['value'] == '42'
178 |
179 |
def test_succesful_app_event_call_with_user_id(uploader, caplog):
    """A user_id present on the row must be forwarded at the payload top level."""
    with requests_mock.Mocker() as request_mocker:
        request_mocker.post(requests_mock.ANY, status_code=204)
        dest = Destination(
            'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL,
            ['api_secret', 'True', 'False', '', 'some_id', ''])
        src = Source('orig1', SourceType.BIG_QUERY, [])
        event = {
            'app_instance_id': '123',
            'name': 'event_name',
            'value': '42',
            'user_id': 'Id42'
        }
        next(uploader.process(
            Batch(Execution(_account_config, src, dest), [event])))

        assert request_mocker.call_count == 1
        assert request_mocker.last_request.json()['user_id'] == 'Id42'
203 |
204 |
def test_succesful_web_user_property_call(uploader, caplog):
    """Each user-property row triggers its own request on a web stream."""
    with requests_mock.Mocker() as request_mocker:
        request_mocker.post(requests_mock.ANY, status_code=204)
        dest = Destination(
            'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL,
            ['api_secret', 'False', 'True', '', '', 'some_id'])
        src = Source('orig1', SourceType.BIG_QUERY, [])
        rows = [
            {'user_ltv': '42', 'client_id': 'some_id'},
            {'user_will_churn': 'Maybe', 'client_id': 'some_id'}
        ]
        next(uploader.process(
            Batch(Execution(_account_config, src, dest), rows)))

        assert request_mocker.call_count == 2
        last_payload = request_mocker.last_request.json()
        assert last_payload['userProperties']['user_will_churn']['value'] == 'Maybe'
232 |
def test_succesful_web_user_property_call_with_user_id(uploader, caplog):
    """A user_id on a user-property row is forwarded at the payload top level."""
    with requests_mock.Mocker() as request_mocker:
        request_mocker.post(requests_mock.ANY, status_code=204)
        dest = Destination(
            'dest1', DestinationType.GA_4_MEASUREMENT_PROTOCOL,
            ['api_secret', 'False', 'True', '', '', 'some_id'])
        src = Source('orig1', SourceType.BIG_QUERY, [])
        row = {
            'user_ltv': '42',
            'user_id': 'Id42',
            'client_id': 'someId'
        }
        next(uploader.process(
            Batch(Execution(_account_config, src, dest), [row])))

        assert request_mocker.call_count == 1
        assert request_mocker.last_request.json()['user_id'] == 'Id42'
257 |
258 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_data_import_eraser.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | import apache_beam as beam
18 | from google.cloud import bigquery
19 | from google.oauth2.credentials import Credentials
20 | from googleapiclient.discovery import build
21 |
22 | from models.execution import DestinationType, Batch
23 |
24 |
class GoogleAnalyticsDataImportEraser(beam.DoFn):
    """
    Clean up every file in a Custom Data Import.

    If you are changing this code, be very careful, since this class deletes ALL FILES within a Data Import.
    Make sure you're not deleting files from the wrong Data Import.
    Also, make sure that all unit tests pass and write new ones as you feel appropriated.
    """

    # NOTE(review): name kept as "...Uploader" (copy-paste from the uploader
    # class) so existing log routing/filters keep working; consider renaming to
    # "...Eraser" in a coordinated change.
    _LOGGER_NAME = "megalista.GoogleAnalyticsDataImportUploader"

    def __init__(self, oauth_credentials):
        """
        Args:
          oauth_credentials: object exposing get_access_token/get_refresh_token/
            get_client_id/get_client_secret, used to build the Analytics client.
        """
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_analytics_service(self):
        # Builds an authenticated Google Analytics Management API v3 client.
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=["https://www.googleapis.com/auth/analytics.edit",
                    'https://www.googleapis.com/auth/adwords'])

        return build('analytics', 'v3', credentials=credentials)

    def start_bundle(self):
        # No per-bundle setup needed; present to satisfy the DoFn contract.
        pass

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Validates that web property id and data import name are both present.

        Raises:
          ValueError: if the destination metadata has fewer than 2 entries or
            either of the first two entries is empty.
        """
        destination = any_execution.destination.destination_metadata
        if len(destination) < 2:
            raise ValueError('Missing destination information. Found {}'.format(len(destination)))

        if not destination[0] or not destination[1]:
            raise ValueError('Missing destination information. Received {}'.format(str(destination)))

    def process(self, batch: Batch, **kwargs):
        """Deletes every uploaded file of the configured Data Import.

        Yields the batch downstream only when the delete call succeeded, so
        later pipeline stages do not run against a half-erased Data Import.
        """
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        ga_account_id = execution.account_config.google_analytics_account_id

        # Reads all metadata parameters
        metadata = execution.destination.destination_metadata
        web_property_id = metadata[0]
        data_import_name = metadata[1]

        logger = logging.getLogger(self._LOGGER_NAME)

        analytics = self._get_analytics_service()
        # .get() guards against responses without an 'items' key (no data
        # sources configured) instead of raising KeyError.
        data_sources = analytics.management().customDataSources().list(
            accountId=ga_account_id, webPropertyId=web_property_id).execute().get('items', [])
        data_source_results = [
            data_source for data_source in data_sources
            if data_source['name'] == data_import_name
        ]

        if len(data_source_results) == 1:
            data_source_id = data_source_results[0]['id']
            try:
                self._call_delete_api(analytics, data_import_name, ga_account_id, data_source_id, web_property_id)
                yield batch
            except Exception as e:
                # Log-and-swallow so one failed erase does not abort the whole
                # pipeline; the batch is deliberately NOT yielded on failure.
                logger.error('Error while deleting GA Data Import files: %s', e)
        else:
            logger.error(
                '%s - data import not found, please configure it in Google Analytics', data_import_name)

    @staticmethod
    def _is_table_empty(execution):
        """Returns True when the execution's source BigQuery table has no rows."""
        table_name = execution.source.source_metadata[0] + '.' + execution.source.source_metadata[1]
        client = bigquery.Client()
        # NOTE(review): table_name is interpolated straight into SQL; it comes
        # from pipeline configuration, so configs must stay trusted input.
        query = "select count(*) from " + table_name + " data"
        logging.getLogger().info('Counting rows from table %s for Execution (%s)', table_name, str(execution))

        # Get count value from BigQuery response
        return list(client.query(query).result())[0][0] == 0

    @staticmethod
    def _call_delete_api(analytics, data_import_name, ga_account_id, data_source_id, web_property_id):
        """Lists every upload of the data source and deletes them in one call."""
        logger = logging.getLogger(GoogleAnalyticsDataImportEraser._LOGGER_NAME)
        logger.info("Listing files from %s - %s", data_import_name, data_source_id)

        uploads = analytics.management().uploads().list(
            accountId=ga_account_id,
            webPropertyId=web_property_id,
            customDataSourceId=data_source_id
        ).execute()

        file_ids = [upload.get('id') for upload in uploads.get('items', [])]
        if not file_ids:
            logger.error("Data Source %s had no files to delete" % data_import_name)
        else:
            logger.info("File Ids: %s", file_ids)
            logger.info("Deleting %s files from %s - %s",
                        len(file_ids), data_import_name, data_source_id)
            analytics.management().uploads().deleteUploadData(
                accountId=ga_account_id,
                webPropertyId=web_property_id,
                customDataSourceId=data_source_id,
                body={
                    'customDataImportUids': file_ids
                }
            ).execute()
131 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_data_import_eraser_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import pytest
17 | from apache_beam.options.value_provider import StaticValueProvider
18 | from models.oauth_credentials import OAuthCredentials
19 | from models.execution import Execution, SourceType, DestinationType, Source, AccountConfig, Destination, Batch
20 | from uploaders.google_analytics.google_analytics_data_import_eraser import GoogleAnalyticsDataImportEraser
21 |
22 |
@pytest.fixture
def eraser(mocker):
    """Builds an eraser wired with dummy OAuth credentials and mocked ads clients."""
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    credentials = OAuthCredentials(
        StaticValueProvider(str, "id"),
        StaticValueProvider(str, "secret"),
        StaticValueProvider(str, "access"),
        StaticValueProvider(str, "refresh"))
    return GoogleAnalyticsDataImportEraser(credentials)
33 |
34 |
def test_analytics_has_not_data_sources(mocker, eraser, caplog):
    """When GA reports no custom data sources, a not-found error is logged."""
    analytics_service = mocker.MagicMock()
    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = analytics_service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    analytics_service.management().customDataSources().list().execute.return_value = {'items': []}

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))

    # Act: nothing is yielded, so exhaust the generator and ignore StopIteration.
    try:
        next(eraser.process(Batch(execution, [])))
    except StopIteration:
        pass

    assert 'data_import_name - data import not found, please configure it in Google Analytics' in caplog.text
58 |
59 |
def test_data_source_not_found(mocker, eraser, caplog):
    """A data source list without the configured name logs a not-found error."""
    analytics_service = mocker.MagicMock()
    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = analytics_service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    analytics_service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'wrong_name'}]
    }

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))

    # Act: nothing is yielded, so exhaust the generator and ignore StopIteration.
    try:
        next(eraser.process(Batch(execution, [])))
    except StopIteration:
        pass

    assert 'data_import_name - data import not found, please configure it in Google Analytics' in caplog.text
83 |
84 |
def test_no_files_found(mocker, eraser):
    """With no uploaded files present, deleteUploadData must never be invoked."""
    analytics_service = mocker.MagicMock()
    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = analytics_service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    analytics_service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'},
                  {'id': 2, 'name': 'data_import_name2'}]
    }

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))

    # The matching data source exists but holds no files.
    analytics_service.management().uploads().list().execute.return_value = {'items': []}

    # Intercept deleteUploadData calls.
    delete_call_mock = mocker.MagicMock()
    analytics_service.management().uploads().deleteUploadData.side_effect = delete_call_mock

    # Act
    next(eraser.process(Batch(execution, [])))

    delete_call_mock.assert_not_called()
115 |
116 |
def test_files_deleted(mocker, eraser):
    """Files listed in the matching data source are deleted via deleteUploadData.

    Fixes the original test, which captured the call kwargs into `ids` but
    never asserted on them (the trailing "# assert" comment was left dangling).
    """
    service = mocker.MagicMock()

    mocker.patch.object(eraser, '_get_analytics_service')
    eraser._get_analytics_service.return_value = service

    mocker.patch.object(eraser, '_is_table_empty')
    eraser._is_table_empty.return_value = False

    service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'},
                  {'id': 2, 'name': 'data_import_name2'}]
    }

    execution = Execution(AccountConfig('', False, '', '', ''),
                          Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
                          Destination('dest1', DestinationType.GA_DATA_IMPORT, ['web_property', 'data_import_name']))

    # The data source contains two uploaded files.
    service.management().uploads().list().execute.return_value = {'items': [{'id': 'ab'}, {'id': 'cd'}]}

    # Add mock to side effect of deleteUploadData
    delete_call_mock = mocker.MagicMock()
    service.management().uploads().deleteUploadData.side_effect = delete_call_mock

    # Act
    next(eraser.process(Batch(execution, [])))

    # Called once
    delete_call_mock.assert_called_once()

    # Intercept args called
    _, kwargs = delete_call_mock.call_args

    # Check if really sent values from custom field: the delete targets the
    # matching data source (id 1) and passes both listed file ids.
    assert kwargs['customDataSourceId'] == 1
    assert kwargs['body'] == {'customDataImportUids': ['ab', 'cd']}
155 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_data_import_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | import logging
17 | from typing import List, Dict
18 |
19 | import apache_beam as beam
20 | from google.oauth2.credentials import Credentials
21 | from googleapiclient.discovery import build
22 | from googleapiclient.http import MediaInMemoryUpload
23 |
24 | from uploaders import utils
25 | from models.execution import DestinationType, Batch, Union
26 |
27 |
class GoogleAnalyticsDataImportUploaderDoFn(beam.DoFn):
    """
    This uploader uploads csv files to Google Analytics Data Import.
    The csv headers are the dict received keys.
    Only one Execution can be handled at a time, meaning that only one data
    import can be handled at a time.
    """

    _LOGGER_NAME = 'megalista.GoogleAnalyticsDataImportUploader'

    def __init__(self, oauth_credentials):
        """
        Args:
          oauth_credentials: object exposing get_access_token/get_refresh_token/
            get_client_id/get_client_secret, used to build the Analytics client.
        """
        super().__init__()
        self.oauth_credentials = oauth_credentials

    def _get_analytics_service(self):
        # Builds an authenticated Google Analytics Management API v3 client.
        credentials = Credentials(
            token=self.oauth_credentials.get_access_token(),
            refresh_token=self.oauth_credentials.get_refresh_token(),
            client_id=self.oauth_credentials.get_client_id(),
            client_secret=self.oauth_credentials.get_client_secret(),
            token_uri='https://accounts.google.com/o/oauth2/token',
            scopes=[
                'https://www.googleapis.com/auth/analytics.edit',
                'https://www.googleapis.com/auth/adwords'
            ])

        return build('analytics', 'v3', credentials=credentials)

    def start_bundle(self):
        # No per-bundle setup needed; present to satisfy the DoFn contract.
        pass

    @staticmethod
    def _assert_all_list_names_are_present(any_execution):
        """Validates that web property id and data import name are both present.

        Raises:
          ValueError: if the destination metadata has fewer than 2 entries or
            either of the first two entries is empty.
        """
        destination = any_execution.destination.destination_metadata
        if len(destination) < 2:
            raise ValueError('Missing destination information. Found '
                             f'{len(destination)}')

        if not destination[0] or not destination[1]:
            raise ValueError('Missing destination information. Received {}'.format(
                str(destination)))

    @utils.safe_process(
        logger=logging.getLogger('megalista.GoogleAnalyticsDataImportUploader'))
    def process(self, batch: Batch, **kwargs):
        """Uploads the batch's rows as a CSV file to the configured Data Import."""
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        ga_account_id = execution.account_config.google_analytics_account_id

        # Reads all metadata parameters
        metadata = execution.destination.destination_metadata

        web_property_id = metadata[0]
        data_import_name = metadata[1]

        self._do_upload_data(web_property_id, data_import_name,
                             ga_account_id, batch.elements)

    def _do_upload_data(self, web_property_id, data_import_name, ga_account_id,
                        rows: List[Dict[str, Union[str, Dict[str, str]]]]):
        """Finds the data source named data_import_name and uploads rows to it."""
        logger = logging.getLogger(self._LOGGER_NAME)

        if not rows:
            # Nothing to upload: skip the API round-trips instead of letting
            # prepare_csv fail with IndexError on an empty row list.
            logger.warning('No rows to upload to %s, skipping', data_import_name)
            return

        analytics = self._get_analytics_service()
        # .get() guards against responses without an 'items' key.
        data_sources = analytics.management().customDataSources().list(
            accountId=ga_account_id,
            webPropertyId=web_property_id).execute().get('items', [])

        data_source_results = [
            data_source for data_source in data_sources
            if data_source['name'] == data_import_name
        ]

        if data_source_results:

            data_source_id = data_source_results[0]['id']

            try:
                self._call_upload_api(analytics, data_import_name, ga_account_id,
                                      data_source_id, rows, web_property_id)
            except Exception as e:
                # Log then re-raise so safe_process can account for the failure.
                logger.error('Error while uploading GA Data: %s', e)
                raise
        else:
            logger.error(
                '%s - data import not found, please configure it in Google Analytics'
                % data_import_name)

    @staticmethod
    def prepare_csv(rows):
        """
        Transform a input into this format:
        sample = [{'col1': 'val1a', 'col2': 'val2a', 'col3':
        'val3a'},
                  {'col1': 'val1b', 'col2': 'val2b', 'col3': 'val3b'},
                  {'col1': 'val1c', 'col2': 'val2c', 'col3': 'val3c'}]
        into a csv:
        col1,col2,col3
        val1a,val2a,val3a
        val1b,val2b,val3b
        val1c,val2c,val3c

        Requires at least one row: the header is taken from the first row's
        keys (each prefixed with 'ga:'). None values become empty cells.
        """
        column_names = ['ga:' + columnName for columnName in rows[0].keys()]
        header = ','.join(column_names)
        body = '\n'.join([
            ','.join(
                ['' if element is None else element
                 for element in row.values()])
            for row in rows
        ])
        return '\n'.join([header, body])

    def _call_upload_api(self, analytics, data_import_name, ga_account_id,
                         data_source_id, rows, web_property_id):
        """Serializes rows to CSV and uploads them via the uploads().uploadData API."""
        logging.getLogger(self._LOGGER_NAME).info(
            'Adding data to %s - %s' % (data_import_name, data_source_id))
        csv = self.prepare_csv(rows)

        media = MediaInMemoryUpload(
            bytes(csv, 'UTF-8'),
            mimetype='application/octet-stream',
            resumable=True)

        analytics.management().uploads().uploadData(
            accountId=ga_account_id,
            webPropertyId=web_property_id,
            customDataSourceId=data_source_id,
            media_body=media).execute()
153 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_data_import_uploader_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 | from apache_beam.options.value_provider import StaticValueProvider
17 |
18 | from models.oauth_credentials import OAuthCredentials
19 | from uploaders.google_analytics.google_analytics_data_import_uploader import GoogleAnalyticsDataImportUploaderDoFn
20 | from models.execution import AccountConfig
21 | from models.execution import Destination
22 | from models.execution import DestinationType
23 | from models.execution import Execution
24 | from models.execution import Source
25 | from models.execution import SourceType
26 | from models.execution import Batch
27 |
28 | _account_config = AccountConfig('1234567890', False, '1234567890', '', '')
29 |
30 |
@pytest.fixture
def uploader(mocker):
    """Provides a DoFn instance backed by dummy OAuth credentials."""
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    credentials = OAuthCredentials(
        StaticValueProvider(str, 'id'),
        StaticValueProvider(str, 'secret'),
        StaticValueProvider(str, 'access'),
        StaticValueProvider(str, 'refresh'))
    return GoogleAnalyticsDataImportUploaderDoFn(credentials)
41 |
42 |
def test_get_service(uploader):
    """The analytics service builder must return a usable client object."""
    service = uploader._get_analytics_service()
    assert service is not None
45 |
46 |
def test_elements_uploading(mocker, uploader):
    """Rows are serialized to a ga:-prefixed CSV and sent via uploadData()."""
    service = mocker.MagicMock()
    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = service

    service.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'}]
    }

    execution = Execution(
        _account_config,
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_DATA_IMPORT,
                    ['web_property', 'data_import_name']))

    # Intercept uploadData() calls.
    upload_mock = mocker.MagicMock()
    service.management().uploads().uploadData.side_effect = upload_mock

    rows = [
        {'user_id': '12', 'cd1': 'value1a', 'cd2': 'value2a'},
        {'user_id': '34', 'cd1': 'value1b', 'cd2': 'value2b'},
        {'user_id': '56', 'cd1': None, 'cd2': ''}
    ]

    # Act
    uploader.process(Batch(execution, rows))

    # Called once
    upload_mock.assert_called_once()

    # Intercept args called
    _, kwargs = upload_mock.call_args

    # Check if really sent values from custom field
    media_bytes = kwargs['media_body'].getbytes(0, -1)

    print(media_bytes)
    assert media_bytes == b'ga:user_id,ga:cd1,ga:cd2\n' \
                          b'12,value1a,value2a\n' \
                          b'34,value1b,value2b\n' \
                          b'56,,'
102 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_measurement_protocol.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import logging
17 | from typing import Dict, Any
18 | from urllib.parse import quote
19 |
20 | import apache_beam as beam
21 | import requests
22 | import re
23 |
24 | from uploaders import utils
25 | from models.execution import DestinationType, Batch
26 |
27 |
class GoogleAnalyticsMeasurementProtocolUploaderDoFn(beam.DoFn):
    """Posts rows as 'event' hits to the Universal Analytics Measurement
    Protocol /batch endpoint, one HTTP request per processed batch.
    """

    def __init__(self):
        super().__init__()
        self.API_URL = "https://www.google-analytics.com/batch"
        # Fixed desktop-Chrome user agent attached to every hit.
        self.UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

    def start_bundle(self):
        # No per-bundle setup needed; present to satisfy the DoFn contract.
        pass

    def _format_hit(self, payload: Dict[str, Any]) -> str:
        """URL-encodes a payload dict into one query-string hit, dropping None values."""
        return "&".join([key + "=" + quote(str(value)) for key, value in payload.items() if value is not None])

    @utils.safe_process(logger=logging.getLogger("megalista.GoogleAnalyticsMeasurementProtocolUploader"))
    def process(self, batch: Batch, **kwargs):
        """Builds one MP hit per row and posts them all in a single batch request.

        Yields the batch downstream on HTTP 200; raises otherwise so the
        safe_process wrapper can log the failure.
        """
        execution = batch.execution
        rows = batch.elements
        payloads = [{
            "v": 1,
            "tid": execution.destination.destination_metadata[0],
            "ni": execution.destination.destination_metadata[1],
            "t": "event",
            "ds": "mp - megalista",
            **{'cid': row[key] for key in row.keys() if key.startswith("client_id")},
            **{'uid': row[key] for key in row.keys() if key.startswith("user_id")},
            "ea": row['event_action'],
            "ec": row['event_category'],
            "ev": row.get('event_value'),
            "el": row.get('event_label'),
            "ua": self.UA,
            # Pass through custom dimensions/metrics (cd1, cm2, ...).
            # Raw string fixes the invalid escape sequence '\d' the original
            # used in a plain string literal.
            **{key: row[key] for key in row.keys() if re.match(r'c[dm]\d+', key)}
        } for row in rows]

        encoded = [self._format_hit(payload) for payload in payloads]

        payload = '\n'.join(encoded)
        response = requests.post(url=self.API_URL, data=payload)
        if response.status_code != 200:
            # response.text carries the decoded body; response.raw (original)
            # was an unread stream object whose repr carries no information.
            raise Exception(
                f"Error uploading to Analytics HTTP {response.status_code}: {response.text}")
        else:
            yield batch
69 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_user_list_uploader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import logging
17 |
18 | import apache_beam as beam
19 | from google.oauth2.credentials import Credentials
20 | from googleapiclient.discovery import build
21 | from googleapiclient.http import MediaInMemoryUpload
22 |
23 | from uploaders import utils
24 | from models.execution import Batch, DestinationType
25 |
26 |
27 | class GoogleAnalyticsUserListUploaderDoFn(beam.DoFn):
    def __init__(self,
                 oauth_credentials):
        # Credentials used to build the GA Management API client on demand.
        super().__init__()
        self.oauth_credentials = oauth_credentials
32 |
33 | def _get_analytics_service(self):
34 | credentials = Credentials(
35 | token=self.oauth_credentials.get_access_token(),
36 | refresh_token=self.oauth_credentials.get_refresh_token(),
37 | client_id=self.oauth_credentials.get_client_id(),
38 | client_secret=self.oauth_credentials.get_client_secret(),
39 | token_uri='https://accounts.google.com/o/oauth2/token',
40 | scopes=["https://www.googleapis.com/auth/analytics.edit", 'https://www.googleapis.com/auth/adwords'])
41 |
42 | service = build('analytics', 'v3', credentials=credentials)
43 | return service
44 |
    def _create_list_if_doesnt_exist(self, analytics, web_property_id, view_ids, list_name, list_definition,
                                     ga_account_id, ads_customer_id, mcc):
        """Returns the id of the remarketing audience named list_name,
        creating it (linked to the given views and Ads account) when absent.

        Args:
          analytics: GA Management API client.
          web_property_id: GA web property holding the audience.
          view_ids: views to link to a newly created audience.
          list_name: audience name searched for / created.
          list_definition: extra audience body fields merged into the insert call.
          ga_account_id: GA account id.
          ads_customer_id: Google Ads account to link the audience to.
          mcc: when truthy, links via MCC_LINKS instead of ADWORDS_LINKS.
        """
        lists = analytics.management().remarketingAudience().list(
            accountId=ga_account_id, webPropertyId=web_property_id).execute()['items']
        results = list(
            filter(lambda x: x['name'] == list_name, lists))
        if len(results) == 0:
            logging.getLogger().info('%s list does not exist, creating...' % list_name)

            response = analytics.management().remarketingAudience().insert(
                accountId=ga_account_id,
                webPropertyId=web_property_id,
                body={
                    'name': list_name,
                    'linkedViews': view_ids,
                    # Manager (MCC) accounts link differently from plain Ads accounts.
                    'linkedAdAccounts': [{
                        'type': 'MCC_LINKS' if mcc else 'ADWORDS_LINKS',
                        'linkedAccountId': ads_customer_id
                    }],
                    **list_definition
                }).execute()
            id = response['id']
            logging.getLogger().info('%s created with id: %s' % (list_name, id))
        else:
            # Reuse the existing audience instead of creating a duplicate.
            id = results[0]['id']
            logging.getLogger().info('%s found with id: %s' % (list_name, id))
        return id
72 |
    def start_bundle(self):
        # No per-bundle setup needed; present to satisfy the DoFn contract.
        pass
75 |
    def _create_list(self, web_property_id, view_id, user_id_list_name, buyer_custom_dim, ga_account_id,
                     ads_customer_id,
                     mcc):
        """Ensures a SIMPLE remarketing list of users whose buyer_custom_dim
        equals 'buyer' exists, with a 365-day membership duration."""
        analytics = self._get_analytics_service()
        view_ids = [view_id]
        self._create_list_if_doesnt_exist(analytics, web_property_id, view_ids, user_id_list_name, {
            'audienceType': 'SIMPLE',
            'audienceDefinition': {
                'includeConditions': {
                    'kind': 'analytics#includeConditions',
                    'isSmartList': False,
                    # Membership condition: custom dimension value == 'buyer'.
                    'segment': 'users::condition::%s==buyer' % buyer_custom_dim,
                    'membershipDurationDays': 365
                }
            }
        }, ga_account_id, ads_customer_id, mcc)
92 |
93 | @staticmethod
94 | def _assert_all_list_names_are_present(any_execution):
95 | destination = any_execution.destination.destination_metadata
96 | if len(destination) < 6:
97 | raise ValueError('Missing destination information. Found {}'.format(len(destination)))
98 |
99 | if not destination[0] \
100 | or not destination[1] \
101 | or not destination[2] \
102 | or not destination[4] \
103 | or not destination[5]:
104 | raise ValueError('Missing destination information. Received {}'.format(str(destination)))
105 |
    @utils.safe_process(logger=logging.getLogger("megalista.GoogleAnalyticsUserListUploader"))
    def process(self, batch: Batch, **kwargs):
        """Uploads the batch's rows to a GA data import, optionally creating
        a user-id based remarketing list first (when metadata[3] is set).
        """
        execution = batch.execution
        self._assert_all_list_names_are_present(execution)

        ads_customer_id = execution.account_config.google_ads_account_id
        mcc = execution.account_config.mcc
        ga_account_id = execution.account_config.google_analytics_account_id

        # Reads all metadata parameters
        metadata = execution.destination.destination_metadata

        web_property_id = metadata[0]
        view_id = metadata[1]
        data_import_name = metadata[2]
        user_id_list_name = metadata[3]  # may be empty: list creation is skipped
        user_id_custom_dim = metadata[4]
        buyer_custom_dim = metadata[5]

        # Optional parameter
        custom_dim_field = metadata[6] if len(metadata) > 6 else None

        self._do_upload_data(web_property_id, view_id, data_import_name, user_id_list_name, user_id_custom_dim,
                             buyer_custom_dim, custom_dim_field, ga_account_id, ads_customer_id, mcc,
                             batch.elements)
131 |
def _do_upload_data(self, web_property_id, view_id, data_import_name, user_id_list_name, user_id_custom_dim,
                    buyer_custom_dim, custom_dim_field, ga_account_id, ads_customer_id, mcc, rows):
    """Pushes rows as CSV into the named GA custom data source.

    When user_id_list_name is non-blank, first ensures the matching
    remarketing audience exists. If the data import is not found (or the name
    is ambiguous), logs an error and uploads nothing.
    """
    if user_id_list_name:
        self._create_list(web_property_id, view_id, user_id_list_name, buyer_custom_dim, ga_account_id,
                          ads_customer_id, mcc)

    analytics = self._get_analytics_service()
    data_sources = analytics.management().customDataSources().list(
        accountId=ga_account_id, webPropertyId=web_property_id).execute()['items']
    matching = [source for source in data_sources if source['name'] == data_import_name]

    if len(matching) != 1:
        logging.getLogger().error(
            "%s - data import not found, please configure it in Google Analytics" % data_import_name)
        return

    data_source_id = matching[0]['id']
    logging.getLogger().info("Adding data to %s - %s" % (data_import_name, data_source_id))

    # CSV payload: header row, then one line per element. Without a custom
    # field every user is tagged with the literal value 'buyer'.
    header = '%s,%s' % (user_id_custom_dim, buyer_custom_dim)
    data_lines = ['%s,%s' % (row['user_id'], row[custom_dim_field] if custom_dim_field else 'buyer')
                  for row in rows]
    body = '\n'.join([header, *data_lines])

    try:
        media = MediaInMemoryUpload(bytes(body, 'UTF-8'),
                                    mimetype='application/octet-stream',
                                    resumable=True)
        analytics.management().uploads().uploadData(
            accountId=ga_account_id,
            webPropertyId=web_property_id,
            customDataSourceId=data_source_id,
            media_body=media).execute()
    except Exception as e:
        logging.getLogger().error('Error while uploading GA Data: %s' % e)
169 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/google_analytics/google_analytics_user_list_uploader_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 | from apache_beam.options.value_provider import StaticValueProvider
17 |
18 | from uploaders.google_analytics.google_analytics_user_list_uploader import GoogleAnalyticsUserListUploaderDoFn
19 | from models.oauth_credentials import OAuthCredentials
20 | from models.execution import Execution, SourceType, DestinationType, Source, AccountConfig, Destination, Batch
21 |
22 |
@pytest.fixture
def uploader(mocker):
    """Builds the DoFn under test with patched googleads clients and dummy OAuth credentials."""
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    credentials = OAuthCredentials(
        StaticValueProvider(str, 'id'),
        StaticValueProvider(str, 'secret'),
        StaticValueProvider(str, 'access'),
        StaticValueProvider(str, 'refresh'))
    return GoogleAnalyticsUserListUploaderDoFn(credentials)
33 |
34 |
def test_get_service(uploader):
    # The analytics service must be constructible from the fixture's credentials.
    service = uploader._get_analytics_service()
    assert service is not None
37 |
38 |
def test_list_already_exists(mocker, uploader):
    # An audience named 'list' already exists remotely, so no insert call is expected.
    analytics = mocker.MagicMock()
    analytics.management().remarketingAudience().list().execute = mocker.Mock(
        return_value={'items': [{'id': 1, 'name': 'list'}]})

    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = analytics

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD,
                    ['a', 'b', 'c', 'list', 'd', 'e']))

    uploader.process(Batch(execution, []))

    insert_mock = uploader._get_analytics_service().management().remarketingAudience().insert
    insert_mock.assert_not_called()
60 |
61 |
def test_list_creation_not_mcc(mocker, uploader):
    # A non-MCC account links the new audience through ADWORDS_LINKS.
    ads_account_id = 'xxx-yyy-zzzz'
    ga_account_id = 'acc'

    analytics = mocker.MagicMock()
    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = analytics
    analytics.management().remarketingAudience().insert().execute.return_value = {'id': 1}

    execution = Execution(
        AccountConfig(ads_account_id, False, ga_account_id, '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination(
            'dest1', DestinationType.GA_USER_LIST_UPLOAD,
            ['web_property', 'view', 'c', 'list', 'd', 'buyers_custom_dim']))
    uploader.process(Batch(execution, []))

    expected_body = {
        'name': 'list',
        'linkedViews': ['view'],
        'linkedAdAccounts': [{
            'type': 'ADWORDS_LINKS',
            'linkedAccountId': ads_account_id
        }],
        'audienceType': 'SIMPLE',
        'audienceDefinition': {
            'includeConditions': {
                'kind': 'analytics#includeConditions',
                'isSmartList': False,
                'segment': 'users::condition::%s==buyer' % 'buyers_custom_dim',
                'membershipDurationDays': 365
            }
        }
    }
    analytics.management().remarketingAudience().insert.assert_any_call(
        accountId=ga_account_id,
        webPropertyId='web_property',
        body=expected_body)
107 |
108 |
def test_list_creation_mcc(mocker, uploader):
    # An MCC account links the new audience through MCC_LINKS.
    ads_account_id = 'xxx-yyy-zzzz'
    ga_account_id = 'acc'

    analytics = mocker.MagicMock()
    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = analytics
    analytics.management().remarketingAudience().insert().execute.return_value = {'id': 1}

    execution = Execution(
        AccountConfig(ads_account_id, True, ga_account_id, '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination(
            'dest1', DestinationType.GA_USER_LIST_UPLOAD,
            ['web_property', 'view', 'c', 'list', 'd', 'buyers_custom_dim']))
    uploader.process(Batch(execution, []))

    expected_body = {
        'name': 'list',
        'linkedViews': ['view'],
        'linkedAdAccounts': [{
            'type': 'MCC_LINKS',
            'linkedAccountId': ads_account_id
        }],
        'audienceType': 'SIMPLE',
        'audienceDefinition': {
            'includeConditions': {
                'kind': 'analytics#includeConditions',
                'isSmartList': False,
                'segment': 'users::condition::%s==buyer' % 'buyers_custom_dim',
                'membershipDurationDays': 365
            }
        }
    }
    analytics.management().remarketingAudience().insert.assert_any_call(
        accountId=ga_account_id,
        webPropertyId='web_property',
        body=expected_body)
154 |
155 |
def test_avoid_list_creation_when_name_blank(mocker, uploader):
    # A blank user-id list name (4th metadata entry) must skip audience creation.
    ads_account_id = 'xxx-yyy-zzzz'
    ga_account_id = 'acc'

    analytics = mocker.MagicMock()
    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = analytics

    execution = Execution(
        AccountConfig(ads_account_id, True, ga_account_id, '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD,
                    ['web_property', 'view', 'c', '', 'd', 'buyers_custom_dim']))

    uploader.process(Batch(execution, []))

    analytics.management().remarketingAudience().insert.assert_not_called()
174 |
175 |
def test_elements_uploading(mocker, uploader):
    # Rows without a custom field default the second CSV column to 'buyer'.
    analytics = mocker.MagicMock()
    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = analytics

    analytics.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'}]
    }

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD, [
            'web_property', 'b', 'data_import_name', 'd', 'user_id_custom_dim',
            'buyer_custom_dim'
        ]))

    # Spy on uploadData() so the CSV payload can be inspected.
    upload_spy = mocker.MagicMock()
    analytics.management().uploads().uploadData.side_effect = upload_spy

    uploader.process(Batch(execution, [{'user_id': '12'}, {'user_id': '34'}]))

    upload_spy.assert_called_once()
    _, kwargs = upload_spy.call_args
    media_bytes = kwargs['media_body'].getbytes(0, -1)
    assert media_bytes == b'user_id_custom_dim,buyer_custom_dim\n12,buyer\n34,buyer'
218 |
219 |
def test_elements_uploading_custom_field(mocker, uploader):
    # When a custom field is configured, its per-row value fills the second CSV column.
    analytics = mocker.MagicMock()
    mocker.patch.object(uploader, '_get_analytics_service')
    uploader._get_analytics_service.return_value = analytics

    analytics.management().customDataSources().list().execute.return_value = {
        'items': [{'id': 1, 'name': 'data_import_name'}]
    }

    execution = Execution(
        AccountConfig('', False, '', '', ''),
        Source('orig1', SourceType.BIG_QUERY, ['dt1', 'buyers']),
        Destination('dest1', DestinationType.GA_USER_LIST_UPLOAD, [
            'web_property', 'b', 'data_import_name', 'd', 'user_id_custom_dim',
            'buyer_custom_dim', 'my_field'
        ]))

    # Spy on uploadData() so the CSV payload can be inspected.
    upload_spy = mocker.MagicMock()
    analytics.management().uploads().uploadData.side_effect = upload_spy

    uploader.process(Batch(execution, [
        {'user_id': '12', 'my_field': '11'},
        {'user_id': '34', 'my_field': '22'},
    ]))

    upload_spy.assert_called_once()
    _, kwargs = upload_spy.call_args
    media_bytes = kwargs['media_body'].getbytes(0, -1)
    assert media_bytes == b'user_id_custom_dim,buyer_custom_dim\n12,11\n34,22'
264 |
--------------------------------------------------------------------------------
/megalist_dataflow/uploaders/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import datetime
16 | from models.execution import DestinationType
17 | from models.execution import Execution
18 | import pytz
19 |
# Total number of attempts safe_call_api makes before giving up.
MAX_RETRIES = 3

# Timezone whose name is appended to dates formatted by format_date.
timezone = pytz.timezone('America/Sao_Paulo')
23 |
24 |
def get_ads_service(service_name, version, oauth_credentials, developer_token,
                    customer_id):
    """Builds an AdWords API service client bound to the given customer id.

    googleads is imported lazily so merely importing this module does not
    require the library.
    """
    from googleads import adwords, oauth2

    refresh_client = oauth2.GoogleRefreshTokenClient(
        oauth_credentials.get_client_id(),
        oauth_credentials.get_client_secret(),
        oauth_credentials.get_refresh_token())
    adwords_client = adwords.AdWordsClient(
        developer_token,
        refresh_client,
        'Mds Dataflow',
        client_customer_id=customer_id)
    # Partial failure lets valid operations succeed even if some rows fail.
    adwords_client.partial_failure = True
    return adwords_client.GetService(service_name, version=version)
39 |
40 |
def format_date(date):
    """Formats a datetime (or its ISO-like string form) as 'YYYYMMDD HHMMSS <tz name>'.

    Args:
      date: a datetime.datetime, or a string such as '2021-01-31T23:59:59.123456'.
            The fractional-seconds part of the string is optional.

    Returns:
      The formatted string, suffixed with the module-level timezone's name.

    Raises:
      ValueError: if a string is supplied that matches neither accepted format.
    """
    if isinstance(date, datetime.datetime):
        pdate = date
    else:
        try:
            pdate = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S.%f')
        except ValueError:
            # Robustness fix: some sources emit timestamps without a
            # microseconds component; previously this raised ValueError.
            pdate = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')

    return f'{datetime.datetime.strftime(pdate, "%Y%m%d %H%M%S")} {timezone.zone}'
48 |
49 |
def safe_process(logger):
    """Decorator factory for uploader DoFn process methods.

    The wrapped method is skipped (with a warning) when the batch argument is
    falsy, and any exception it raises is logged — message, offending batch and
    a single stack trace — instead of propagating, so one bad batch cannot
    crash the whole pipeline. Returns None in both failure cases.

    Args:
      logger: the logging.Logger used for all messages.
    """
    import functools  # local import keeps this block self-contained

    def deco(func):
        @functools.wraps(func)  # fix: preserve the wrapped method's name/docstring
        def inner(*args, **kwargs):
            batch = args[1]  # args[0] is the DoFn instance (self)
            if not batch:
                logger.warning('Skipping upload, received no elements.')
                return
            logger.info(f'Uploading {len(batch.elements)} rows...')
            try:
                return func(*args, **kwargs)
            except Exception:
                # Fix: single exception log (the stack trace was previously
                # emitted twice) and corrected colon placement in the message.
                logger.exception(f'Error uploading data for: {batch.elements}')

        return inner

    return deco
68 |
69 |
def safe_call_api(function, logger, *args, **kwargs):
    """Calls function(*args, **kwargs), retrying on failure up to MAX_RETRIES attempts.

    Returns the function's result, or None when every attempt failed.
    """
    # Bug fix: the result was previously computed and then discarded.
    return _do_safe_call_api(function, logger, 1, *args, **kwargs)


def _do_safe_call_api(function, logger, current_retry, *args, **kwargs):
    """Single attempt of function; recurses with an incremented counter on failure."""
    try:
        # Bug fix: kwargs was unpacked positionally (*kwargs), which passed
        # the dict's KEYS as extra positional arguments instead of key=value.
        return function(*args, **kwargs)
    except Exception:
        if current_retry < MAX_RETRIES:
            logger.exception(
                f'Fail number {current_retry}. Stack track follows. Trying again.')
            return _do_safe_call_api(function, logger, current_retry + 1, *args, **kwargs)
        # Fix: the final failure used to be swallowed silently; log it once.
        # The error is still not re-raised, preserving the best-effort contract.
        logger.exception(f'Fail number {current_retry}. Giving up.')
84 |
85 |
def convert_datetime_tz(dt, origin_tz, destination_tz):
    """Interprets naive datetime `dt` as being in origin_tz and converts it to destination_tz."""
    localized = pytz.timezone(origin_tz).localize(dt)
    return localized.astimezone(pytz.timezone(destination_tz))
89 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.8
3 | warn_return_any = True
4 | warn_unused_configs = True
5 | ignore_missing_imports = True
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.semantic_release]
2 | upload_to_pypi = false
3 | branch = 'master'
4 | version_variable = [
5 | 'megalist_dataflow/setup.py:__version__'
6 | ]
7 | version_source = 'tag'
8 | build_command = false
9 |
--------------------------------------------------------------------------------
/run_cloud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | if [ $# != 3 ]; then
18 | echo "Usage: $0 gcp_project_id bucket_name region"
19 | exit 1
20 | fi
21 |
22 | gcloud config set project $1
23 | token=$(gcloud auth application-default print-access-token)
24 | curl -H "Authorization: Bearer $token" -H "Content-Type:application/json" "https://dataflow.googleapis.com/v1b3/projects/$1/locations/$3/templates:launch?gcsPath=gs://$2/templates/mds" --data-binary "@cloud_config/scheduler.json"
25 |
--------------------------------------------------------------------------------
/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
# Fix: fail fast so a mypy error is not masked by pytest's exit status
# (previously the script's exit code reflected only the last command).
set -e

python3 -m mypy megalist_dataflow
python3 -m pytest -vv --cov=megalist_dataflow -W ignore::DeprecationWarning
18 |
--------------------------------------------------------------------------------
/terraform/external.tf:
--------------------------------------------------------------------------------
# create dataflow metadata
2 | resource "null_resource" "bucket_megalista_metadata" {
3 | provisioner "local-exec" {
4 | command = "sh ./scripts/deploy_cloud.sh ${data.google_client_config.current.project} ${var.bucket_name} ${var.region}"
5 | }
6 |
7 | depends_on = [google_storage_bucket.my_storage]
8 | }
--------------------------------------------------------------------------------
/terraform/main.tf:
--------------------------------------------------------------------------------
1 | data "google_client_config" "current" {
2 | }
3 |
4 | data "google_client_openid_userinfo" "me" {
5 | }
6 |
7 | resource "google_bigquery_dataset" "dataset" {
8 | dataset_id = var.bq_ops_dataset
9 | location = var.location
10 | description = "Auxliary bigquery dataset for Megalista operations to create"
11 | delete_contents_on_destroy = true
12 | }
13 |
14 | locals {
15 | scheduler_body = <