├── .circleci └── config.yml ├── .dockerignore ├── .flake8 ├── .gce_boto ├── .gcloudignore ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ ├── glean.yaml │ └── update-fog.yml ├── .gitignore ├── .yamllint ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── conftest.py ├── docker-compose.yml ├── docs.png ├── docs └── common-failures.md ├── fog-updater ├── Dockerfile ├── README.md ├── action.yml ├── requirements.txt └── src │ ├── fog_update.py │ └── test_util.py ├── main.py ├── notebooks └── load_and_run.ipynb ├── probe_scraper ├── __init__.py ├── check_repositories.py ├── emailer.py ├── exc.py ├── fog_checks.py ├── glean_checks.py ├── glean_push.py ├── model_validation.py ├── parsers │ ├── __init__.py │ ├── events.py │ ├── histograms.py │ ├── metrics.py │ ├── pings.py │ ├── repositories.py │ ├── scalars.py │ ├── tags.py │ ├── third_party │ │ ├── __init__.py │ │ ├── histogram_tools.py │ │ ├── parse_events.py │ │ ├── parse_scalars.py │ │ ├── shared_telemetry_utils.py │ │ └── usecounters.py │ └── utils.py ├── ping_expiry_alert.py ├── probe_expiry_alert.py ├── remote_storage.py ├── runner.py ├── scrapers │ ├── __init__.py │ ├── buildhub.py │ ├── git_scraper.py │ └── moz_central_scraper.py ├── transform_probes.py └── transform_revisions.py ├── probeinfo_api.yaml ├── pytest.ini ├── repositories.yaml ├── requirements.txt ├── setup.py ├── test_requirements.txt └── tests ├── __init__.py ├── resources ├── Histograms.json ├── UseCounters.conf ├── metrics.yaml ├── nsDeprecatedOperationList.h ├── test_events.yaml ├── test_repo_files │ ├── duplicate │ │ └── 0 │ │ │ └── metrics.yaml │ ├── expired │ │ └── 0 │ │ │ └── metrics.yaml │ ├── improper │ │ └── 0 │ │ │ └── metrics.yaml │ └── normal │ │ ├── 0 │ │ └── metrics.yaml │ │ ├── 1 │ │ └── metrics.yaml │ │ └── 2 │ │ ├── metrics.yaml │ │ └── tags.yaml └── test_scalars.yaml ├── test_buildhub.py ├── test_event_parser.py ├── test_fog_checks.py ├── test_git_scraper.py ├── test_glean_checks.py ├── test_glean_limit_date.py ├── test_glean_push.py ├── test_histogram_parser.py ├── test_library_refs.py ├── test_metrics_parser.py ├── test_moz_central_scraper.py ├── test_ping_expiry_alert.py ├── test_probe_expiry_alert.py ├── test_repositories_parser.py ├── test_runner.py ├── test_scalar_parser.py └── test_transform_probes.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | version: 2 4 | jobs: 5 | build_and_test: 6 | machine: 7 | image: default 8 | working_directory: ~/mozilla/probe-scraper 9 | steps: 10 | - checkout 11 | - run: make build 12 | - run: make lint 13 | - run: make check-repos 14 | - run: make test 15 | - run: make burnham-dryrun 16 | 17 | deploy_docker: 18 | docker: &gcloud-image 19 | - image: gcr.io/google.com/cloudsdktool/cloud-sdk:398.0.0-alpine 20 | working_directory: ~/mozilla/probe-scraper 21 | steps: 22 | - checkout 23 | - setup_remote_docker: 24 | docker_layer_caching: true 25 | - run: 26 | name: Build container 27 | command: | 28 | docker build -t app:build . 
29 | - run: 30 | name: Configure gcloud 31 | command: | 32 | echo $GCLOUD_SERVICE_KEY | gcloud auth activate-service-account --key-file=- 33 | gcloud --quiet config set project ${GOOGLE_PROJECT_ID} 34 | gcloud --quiet config set compute/zone ${GOOGLE_COMPUTE_ZONE} 35 | gcloud auth configure-docker 36 | - run: 37 | name: Deploy to GCR 38 | command: | 39 | DOCKER_IMAGE="gcr.io/${GOOGLE_PROJECT_ID}/probe-scraper" 40 | # Deploy main 41 | if [ "${CIRCLE_BRANCH}" == main ]; then 42 | docker tag app:build "${DOCKER_IMAGE}:latest" 43 | docker push "${DOCKER_IMAGE}:latest" 44 | elif [ ! -z "${CIRCLE_TAG}" ]; then 45 | # Deploy a release tag... 46 | echo "${DOCKER_IMAGE}:${CIRCLE_TAG}" 47 | docker tag app:build "${DOCKER_IMAGE}:${CIRCLE_TAG}" 48 | docker images 49 | docker push "${DOCKER_IMAGE}:${CIRCLE_TAG}" 50 | fi 51 | 52 | deploy_cloud_function: 53 | docker: *gcloud-image 54 | steps: 55 | - checkout 56 | - run: 57 | name: Install jq 58 | command: apk add jq 59 | - run: 60 | name: Activate Credentials 61 | command: | 62 | KEY="$(echo "$GCLOUD_SERVICE_KEY_PROD_B64" | base64 -d)" 63 | echo "$KEY" | gcloud --quiet auth activate-service-account --key-file=- 64 | gcloud --quiet config set project "$(echo "$KEY" | jq -r .project_id)" 65 | - run: 66 | # `--source=.` in the command below is a workaround for deployment issues 67 | # See DENG-3665 68 | name: Deploy Google Cloud Function 69 | command: > 70 | gcloud functions deploy glean-push 71 | --region=us-west1 72 | --allow-unauthenticated 73 | --entry-point=glean_push 74 | --memory=2048 75 | --runtime=python310 76 | --set-env-vars=BOTO_PATH=.gce_boto,OUTPUT_BUCKET=gs://probe-scraper-prod-artifacts/ 77 | --trigger-http 78 | --service-account=$PROD_SERVICE_ACCOUNT_INVOKER 79 | --timeout=540s 80 | --source=. 81 | 82 | docs_build: 83 | docker: 84 | - image: cimg/node:lts 85 | steps: 86 | - checkout 87 | - run: 88 | name: Install redoc 89 | command: | 90 | npm install @redocly/cli 91 | - run: 92 | name: Build docs 93 | command: | 94 | npx @redocly/cli build-docs probeinfo_api.yaml -o docs/index.html \ 95 | --theme.openapi.expandResponses="200,201" \ 96 | --theme.openapi.jsonSampleExpandLevel=2 97 | - persist_to_workspace: 98 | root: docs 99 | paths: index.html 100 | docs_deploy: 101 | docker: 102 | - image: cimg/node:lts 103 | steps: 104 | - checkout 105 | - attach_workspace: 106 | at: docs/ 107 | - run: 108 | name: Disable jekyll builds 109 | command: touch docs/.nojekyll 110 | - run: 111 | name: Install and configure dependencies 112 | command: | 113 | npm install gh-pages@2.0.1 114 | git config user.email "ci-build@mozilla.com" 115 | git config user.name "ci-build" 116 | - add_ssh_keys: 117 | fingerprints: 118 | - "37:cd:ad:cf:75:1f:96:9f:9b:ce:e0:6c:b4:09:26:4d" 119 | - run: 120 | name: Deploy docs to gh-pages branch 121 | command: npx gh-pages --dotfiles --message "[skip ci] Updates" --dist docs/ 122 | 123 | workflows: 124 | version: 2 125 | build-test-deploy: 126 | jobs: 127 | - build_and_test: 128 | filters: 129 | tags: 130 | only: /.*/ 131 | - docs_build 132 | - docs_deploy: 133 | requires: 134 | - docs_build 135 | filters: 136 | branches: 137 | only: main 138 | - deploy_docker: 139 | context: data-eng-airflow-gcr 140 | requires: 141 | - build_and_test 142 | filters: 143 | tags: 144 | only: /.*/ 145 | branches: 146 | only: main 147 | - deploy_cloud_function: 148 | context: probe-scraper 149 | requires: 150 | - build_and_test 151 | filters: 152 | branches: 153 | only: main 154 | 
-------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # idea project settings 86 | .idea/ 87 | *.iml 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Cache files 96 | probe_scraper_errors_cache.json 97 | probe_scraper_cache.sqlite 98 | 99 | # Temp files. 100 | temp/ 101 | _tmp/ 102 | _out/ 103 | _temp/ 104 | 105 | *.json 106 | !tests/resources/*.json 107 | !schemas/*.json 108 | 109 | # Generated files 110 | index.html 111 | emails.txt 112 | .repositories.yaml 113 | 114 | # Ignore .DS_Store 115 | .DS_Store 116 | 117 | # A convenient place for the cache 118 | .scraper_cache/ 119 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # See http://pep8.readthedocs.io/en/latest/intro.html#configuration 3 | max-line-length = 100 4 | filename = *.py, +.lint 5 | exclude = probe_scraper/parsers/third_party/* venv 6 | -------------------------------------------------------------------------------- /.gce_boto: -------------------------------------------------------------------------------- 1 | [GoogleCompute] 2 | service_account = default 3 | 4 | [GSUtil] 5 | state_dir = /tmp/gsutil 6 | -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # idea project settings 88 | .idea/ 89 | *.iml 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # Cache files 98 | probe_scraper_errors_cache.json 99 | probe_scraper_cache.sqlite 100 | 101 | # Temp files. 102 | temp/ 103 | _tmp/ 104 | _out/ 105 | _temp/ 106 | 107 | *.json 108 | !tests/resources/*.json 109 | !schemas/*.json 110 | 111 | # Generated files 112 | index.html 113 | emails.txt 114 | .repositories.yaml 115 | 116 | # Ignore .DS_Store 117 | .DS_Store 118 | 119 | # A convenient place for the cache 120 | .scraper_cache/ 121 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Adding/changing the data in repositories.yaml can have large downstream 2 | # effects, so we're a little stricter on who can sign off on changes here. 3 | repositories.yaml @chutten @akkomar @whd @mikaeld @dexterp37 @badboy @travis79 4 | 5 | # The exclusion list in git_scraper.py can cause similar problems (see e.g. 
6 | # https://bugzilla.mozilla.org/show_bug.cgi?id=1745771)
7 | probe_scraper/scrapers/git_scraper.py @chutten @akkomar @whd @mikaeld @dexterp37 @badboy @travis79
8 | 
9 | fog_updater/* @chutten @badboy
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   # Auto-update to next glean-parser major version
4 |   - package-ecosystem: "pip"
5 |     directory: "/"
6 |     schedule:
7 |       interval: "daily"
8 |     reviewers:
9 |       - "mozilla/glean"
10 |     versioning-strategy: increase-if-necessary
11 |     allow:
12 |       - dependency-name: "glean-parser"
--------------------------------------------------------------------------------
/.github/workflows/glean.yaml:
--------------------------------------------------------------------------------
1 | name: Glean probe-scraper
2 | 
3 | on:
4 |   workflow_call:
5 | 
6 | jobs:
7 |   probe-scraper:
8 |     name: Glean probe-scraper
9 |     runs-on: ubuntu-22.04
10 |     steps:
11 |       - name: Validate Glean metrics via probe-scraper, and if appropriate publish changes
12 |         run: |-
13 |           curl --fail-with-body https://us-west1-moz-fx-data-probe-s-prod-2bc3.cloudfunctions.net/glean-push --data '{
14 |             "url": "${{github.server_url}}/${{github.repository}}",
15 |             "commit":"${{github.sha}}",
16 |             "branch":"${{github.ref_name}}"
17 |           }'
18 | 
--------------------------------------------------------------------------------
/.github/workflows/update-fog.yml:
--------------------------------------------------------------------------------
1 | # This Source Code Form is subject to the terms of the Mozilla Public
2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/
4 | 
5 | # This workflow periodically calls the fog-update-bot action to update
6 | # the list of FOG metrics.yaml and ping.yaml files from its metrics_index.py
7 | 
8 | 
9 | name: "Update FOG"
10 | 
11 | permissions:
12 |   contents: write
13 |   pull-requests: write
14 | 
15 | on:
16 |   schedule:
17 |     # 04:20 UTC - every morning
18 |     # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
19 |     - cron: '20 4 * * *'
20 |   workflow_dispatch:
21 | 
22 | jobs:
23 |   main:
24 |     name: "Update FOG"
25 |     runs-on: ubuntu-22.04
26 |     steps:
27 |       - name: Checkout
28 |         uses: actions/checkout@v3
29 |       - name: "Update FOG"
30 |         uses: ./fog-updater
31 |         id: fog-updater
32 |         if: github.repository == 'mozilla/probe-scraper'
33 |         env:
34 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # idea project settings 86 | .idea/ 87 | *.iml 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Cache files 96 | probe_scraper_errors_cache.json 97 | probe_scraper_cache.sqlite 98 | 99 | # Temp files. 100 | temp/ 101 | _tmp/ 102 | _out/ 103 | _temp/ 104 | 105 | *.json 106 | !tests/resources/*.json 107 | !schemas/*.json 108 | 109 | # Generated files 110 | index.html 111 | emails.txt 112 | .repositories.yaml 113 | 114 | # Ignore .DS_Store 115 | .DS_Store 116 | 117 | # A convenient place for the cache 118 | .scraper_cache/ 119 | 120 | # JavaScript tooling 121 | node_modules/ 122 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | rules: 4 | line-length: 5 | allow-non-breakable-words: true 6 | allow-non-breakable-inline-mappings: true 7 | ignore: | 8 | .circleci/config.yml 9 | probeinfo_api.yaml 10 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Thank you for your interest in contributing to probe-scraper! 2 | This document tries to codify some best practices for contribution to this 3 | repository. 4 | 5 | ## Participation guidelines 6 | 7 | All communication is expected to follow the 8 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 9 | For more information, see the [code of conduct document](./CODE_OF_CONDUCT.md) 10 | in the root of this repository. 11 | 12 | ## Filing issues 13 | 14 | File an issue if you have a bug report or feature request that you (personally) 15 | do not intend to work on right away _or_ you would like additional feedback on 16 | your approach before starting implementation work. 
If you found a bug (or small
17 | missing feature) and you want to start implementing it immediately (or already
18 | have a solution), go ahead and skip straight to making a pull request (see
19 | below).
20 | 
21 | To help with triage, issues should have a descriptive title. Examples of good
22 | issue titles:
23 | 
24 | - Require channels be unique for applications
25 | - "Telemetry Probe Expiry" emails sometimes don't include list of filed bugs
26 | 
27 | In the issue itself, provide as much information as necessary to help someone
28 | reading it understand the nature of the problem (and provide feedback). For
29 | examples of this, look at some of the
30 | [fixed issues](https://github.com/mozilla/probe-scraper/issues?q=is%3Aissue+is%3Aclosed)
31 | filed by the project maintainers.
32 | 
33 | Occasionally, probe-scraper bugs are tracked inside Bugzilla, especially for issues
34 | which might affect other parts of the pipeline.
35 | 
36 | ## Opening pull requests
37 | 
38 | Like issues, pull requests should have a descriptive title to help with triage.
39 | However, there are two things that are different:
40 | 
41 | - Instead of pointing out a problem, they should describe the solution
42 | - If a pull request fixes a specific issue, the title should specify
43 |   `(fixes #X)` (where X refers to the issue number)
44 | 
45 | For example, a pull request to fix an issue entitled `"Telemetry Probe Expiry" emails sometimes don't include list of filed bugs` could be named `Include list of filed bugs in "Telemetry Probe Expiry" emails (fixes #1234)`.
46 | 
47 | When a pull request fixes a bug in Bugzilla, prepend the bug number to the title with
48 | the keyword `Bug ` in the format `Bug XXXX - `.
49 | This allows the [Bugzilla PR Linker] to link to this PR automatically in Bugzilla.
50 | For example, `Bug 1234 - Include list of filed bugs in "Telemetry Probe Expiry" emails`.
51 | 
52 | As much as possible, each pull request should attempt to solve _one problem_.
53 | For logically separate changes, file multiple PRs.
54 | 
55 | Make sure that the pull request passes continuous integration (including linter
56 | checks) and that there are no merge conflicts before asking for review. If you
57 | want some feedback on a work-in-progress (where these conditions are not yet
58 | met), mark your pull request as a draft.
59 | 
60 | [bugzilla pr linker]: https://github.com/mozilla/github-bugzilla-pr-linker
61 | 
62 | ## Dangerous changes
63 | 
64 | This repository is central to the ingestion and processing of Telemetry data at
65 | Mozilla.
66 | Changes made to probe-scraper can have large downstream consequences, such as unwanted changes to our BigQuery tables.
67 | In particular, adding new Glean repositories (`repositories.yaml` at the root
68 | of this repository) needs to be done with care.
69 | 
70 | Things to bear in mind:
71 | 
72 | - Once probe scraper has successfully run, there is no changing or rewriting history of the metrics files, as this will cause problems downstream with [mozilla-schema-generator].
73 | - There is currently no provision for deleting a repository once added (see [bug 1747811]).
74 | 
75 | As such, testing of works in progress should happen locally with a probe-scraper checkout (see the "dry run" instructions in the README) and/or evaluating test pings via the [Glean Debug Ping Viewer].
76 | Under no circumstances should you add a testing application to "see what happens".
77 | If you only want part of the history of a repository processed by probe-scraper, you can set a "start 78 | date" in `probe_scraper/scrapers/git_scraper.py` _before_ the first successful run of probe-scraper 79 | against it (i.e. the changes to `git_scraper.py` and `repositories.yaml` should land as a unit). 80 | 81 | To try and prevent incidents from occurring, changes to these files must go through people who have extensive 82 | experience debugging and reasoning about the schema generation portions of the data pipeline, documented in `.github/CODEOWNERS`. 83 | If you submit a pull request, these people will automatically be flagged for review. 84 | 85 | [mozilla-schema-generator]: https://github.com/mozilla/mozilla-schema-generator 86 | [bug 1747811]: https://bugzilla.mozilla.org/show_bug.cgi?id=1747811 87 | [glean debug ping viewer]: https://mozilla.github.io/glean/book/user/debugging/index.html#glean-debug-view 88 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | ENV PYTHONUNBUFFERED=1 4 | 5 | ARG APP_NAME=probe-scraper 6 | ENV APP_NAME=${APP_NAME} 7 | 8 | # Guidelines here: https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md 9 | ARG USER_ID="10001" 10 | ARG GROUP_ID="app" 11 | ARG HOME="/app" 12 | 13 | ENV HOME=${HOME} 14 | RUN groupadd --gid ${USER_ID} ${GROUP_ID} && \ 15 | useradd --create-home --uid ${USER_ID} --gid ${GROUP_ID} --home-dir /app ${GROUP_ID} 16 | 17 | # List packages here 18 | RUN apt-get update && \ 19 | apt-get install -y --no-install-recommends \ 20 | file \ 21 | gcc \ 22 | libwww-perl && \ 23 | apt-get autoremove -y && \ 24 | apt-get clean 25 | 26 | # Upgrade pip 27 | RUN pip install --upgrade pip 28 | # Ensure setuptools is new enough, to avoid issues with wheels 29 | RUN pip install 'setuptools>=70.1' 30 | 31 | WORKDIR ${HOME} 32 | 33 | COPY requirements.txt ${HOME}/ 34 | RUN pip install -r requirements.txt 35 | 36 | COPY test_requirements.txt ${HOME}/ 37 | RUN pip install -r test_requirements.txt 38 | 39 | COPY . ${HOME} 40 | RUN pip install . 41 | 42 | # Drop root and change ownership of the application folder to the user 43 | RUN chown -R ${USER_ID}:${GROUP_ID} ${HOME} 44 | USER ${USER_ID} 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help clean lint test build docker-rm shell run stop apidoc 2 | 3 | help: 4 | @echo " apidoc Render the API documentation locally to index.html" 5 | @echo " clean Remove build artifacts" 6 | @echo " check-repos Verify all repositories in repositories.yaml are scrapable" 7 | @echo " lint Check style with flake8" 8 | @echo " format Format code with black and isort" 9 | @echo " test Run tests quickly with the default Python" 10 | @echo " build Builds the docker images for the docker-compose setup" 11 | @echo " docker-rm Stops and removes all docker containers" 12 | @echo " shell Opens a Bash shell" 13 | @echo " run Run a command. Can run scripts, e.g. make run COMMAND=\"./scripts/schema_generator.sh\"" 14 | @echo " stop Stop docker compose" 15 | 16 | clean: clean-build clean-pyc docker-rm 17 | 18 | clean-build: 19 | rm -fr build/ 20 | rm -fr dist/ 21 | rm -fr *.egg-info 22 | 23 | clean-pyc: 24 | find . -name '*.pyc' -exec rm -f {} + 25 | find . -name '*.pyo' -exec rm -f {} + 26 | find . 
-name '*~' -exec rm -f {} + 27 | 28 | apidoc: 29 | # Keep in sync with doc task in .circleci/config.yml 30 | docker run --rm \ 31 | -v ${PWD}:/local \ 32 | cimg/node:lts \ 33 | sh -c "npm install @redocly/cli; npx @redocly/cli build-docs /local/probeinfo_api.yaml -o /local/index.html --theme.openapi.expandResponses='200,201' --theme.openapi.jsonSampleExpandLevel=2" 34 | 35 | format: 36 | python3 -m black probe_scraper tests ./*.py 37 | python3 -m isort --profile black probe_scraper tests ./*.py 38 | 39 | lint: build 40 | docker-compose run app flake8 . 41 | docker-compose run app yamllint repositories.yaml .circleci 42 | docker-compose run app python -m black --check probe_scraper tests ./*.py 43 | docker-compose run app python -m isort --profile black --check-only probe_scraper tests ./*.py 44 | 45 | check-repos: 46 | docker-compose run app python -m probe_scraper.check_repositories 47 | 48 | test: build 49 | docker-compose run app pytest tests/ --run-web-tests 50 | 51 | # For this test, we scrape glean-core and burnham. 52 | # Even though burnham is deprecated, it should still be valid to be scraped 53 | # See also mozilla/probe-scraper#283. 54 | # We set a limit date due to more strict parsing. 55 | # glean-core's metrics.yaml prior to 2023-10-20 cannot be parsed with a modern glean-parser. 56 | burnham-dryrun: 57 | docker-compose run app python -m probe_scraper.runner --glean --glean-repo glean-core --glean-repo glean-android --glean-repo burnham --glean-limit-date 2023-10-20 --dry-run 58 | 59 | build: 60 | docker-compose build 61 | 62 | docker-rm: stop 63 | docker-compose rm -f 64 | 65 | shell: 66 | docker-compose run --entrypoint "/bin/bash" app 67 | 68 | run: build 69 | docker-compose run app $(COMMAND) 70 | 71 | stop: 72 | docker-compose down 73 | docker-compose stop 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # probe-scraper 2 | Scrape Telemetry probe data from Firefox repositories. 3 | 4 | This extracts per-version Telemetry probe data for Firefox and other Mozilla products from registry files like Histograms.json and Scalars.yaml. 5 | The data allows answering questions like "which Firefox versions is this Telemetry probe in anyway?". 6 | Also, probes outside of Histograms.json - like the CSS use counters - are included in the output data. 7 | 8 | The data is pulled from two different sources: 9 | - From [`hg.mozilla.org`](https://hg.mozilla.org) for Firefox data. 10 | - From a [configurable set of Github repositories](repositories.yaml) that use [Glean](https://github.com/mozilla-mobile/android-components/tree/master/components/service/glean). 11 | 12 | Probe Scraper outputs JSON to https://probeinfo.telemetry.mozilla.org. 13 | Effectively, this creates a REST API which can be used by downstream tools like 14 | [mozilla-schema-generator](https://github.com/mozilla/mozilla-schema-generator) 15 | and various data dictionary type applications (see below). 16 | 17 | An [OpenAPI reference](https://mozilla.github.io/probe-scraper/) to this API is available: 18 | 19 | ![probeinfo API docs](docs.png) 20 | 21 | A web tool to explore the Firefox-related data is available at [probes.telemetry.mozilla.org](https://probes.telemetry.mozilla.org/). A project to develop a similar view for Glean-based data 22 | is under development in the [Glean Dictionary](https://github.com/mozilla/glean-dictionary). 
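
For example, a downstream consumer can read the published probe data directly over HTTPS. The snippet below is a minimal sketch: it uses the Firefox main-ping endpoint documented under "Accessing the data files" below, and assumes only that the endpoint returns a JSON object keyed by probe name.

```python
import requests

# Fetch all Firefox main-ping probe definitions from the probeinfo API.
url = "https://probeinfo.telemetry.mozilla.org/firefox/all/main/all_probes"
probes = requests.get(url).json()
print(f"{len(probes)} probes published")
```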
23 | 24 | ## Deprecation 25 | 26 | Deprecation is an important step in an application lifecycle. Because of the backwards-compatible nature of our pipeline, we do not 27 | remove Glean apps or variants from the `repositories.yaml` file - instead, we mark them as deprecated. 28 | 29 | ### Marking an App Variant as deprecated 30 | 31 | When an app variant is marked as deprecated (see this [example from Fenix](https://github.com/mozilla/probe-scraper/blob/213055f967b4903933667002ec376cd69cdf5a77/repositories.yaml#L415-L431)), the following happens: 32 | - It shows as `[Deprecated]` in the Glean Dictionary, in the `Access` section (see e.g. [Fenix's client_id metric](https://dictionary.telemetry.mozilla.org/apps/fenix/metrics/client_id)). 33 | 34 | ### Marking an App as deprecated 35 | 36 | When an app is marked as deprecated (see this [example of Firefox for Fire TV](https://github.com/mozilla/probe-scraper/blob/213055f967b4903933667002ec376cd69cdf5a77/repositories.yaml#L501-L504)), the following happens: 37 | - It no longer shows by default in the Glean Dictionary. (Deprecated apps can be viewed by clicking the `Show deprecated applications` checkbox) 38 | 39 | ## Adding a New Glean Repository 40 | 41 | To scrape a git repository for probe definitions, an entry needs to be added in `repositories.yaml`. 42 | The exact format of the entry depends on whether you are adding an application or a library. See below for details. 43 | 44 | ### Adding an application 45 | 46 | For a given application, Glean metrics are emitted by the application itself, any libraries it uses 47 | that also use Glean, as well as the Glean library proper. Therefore, probe scraper needs a way to 48 | find all of the dependencies to determine all of the metrics emitted by 49 | that application. 50 | 51 | Therefore, each application should specify a `dependencies` parameter, which is a list of Glean-using libraries used by the application. Each entry should be a library name as specified by the library's `library_names` parameter. 52 | 53 | For Android applications, if you're not sure what the dependencies of the application are, you can run the following command at the root of the project folder: 54 | 55 | ```bash 56 | $ ./gradlew :app:dependencies 57 | ``` 58 | 59 | See the full [application schema documentation](https://mozilla.github.io/probe-scraper/#tag/application) 60 | for descriptions of all the available parameters. 61 | 62 | ### Adding a library 63 | 64 | Probe scraper also needs a way to map dependencies back to an entry in the 65 | `repositories.yaml` file. Therefore, any libraries defined should also include 66 | their build-system-specific library names in the `library_names` parameter. 67 | 68 | See the full [library schema documentation](https://mozilla.github.io/probe-scraper/#tag/library) 69 | for descriptions of all the available parameters. 70 | 71 | ## Developing the probe-scraper 72 | 73 | You can choose to develop using the container, or locally. Using the container will be slower, since changes will trigger a rebuild of the container. 74 | But using the container method will ensure that your PR passes CircleCI build/test phases. 75 | 76 | ### Local development 77 | 78 | You may wish to, 79 | instead of installing all these requirements in your global Python environment, 80 | start by generating and activating a 81 | [Python virtual environment](https://docs.python.org/3/library/venv.html). 82 | The `.gitignore` expects it to be called `ENV` or `venv`: 83 | ```console 84 | python -m venv venv 85 | . 
venv/bin/activate
86 | ```
87 | 
88 | Install the requirements:
89 | ```
90 | pip install -r requirements.txt
91 | pip install -r test_requirements.txt
92 | python setup.py develop
93 | ```
94 | 
95 | Run tests. This by default does not run tests that require a web connection:
96 | ```
97 | pytest tests/
98 | ```
99 | 
100 | To run all tests, including those that require a web connection:
101 | ```
102 | pytest tests/ --run-web-tests
103 | ```
104 | 
105 | To test whether the code conforms to the style rules, you can run:
106 | ```
107 | python -m black --check probe_scraper tests ./*.py
108 | flake8 --max-line-length 100 probe_scraper tests ./*.py
109 | yamllint repositories.yaml .circleci
110 | python -m isort --profile black --check-only probe_scraper tests ./*.py
111 | ```
112 | 
113 | To render API documentation locally to `index.html`:
114 | ```
115 | make apidoc
116 | ```
117 | 
118 | ### Developing using the container
119 | 
120 | Run tests in container. This does not run tests that require a web connection:
121 | ```
122 | export COMMAND='pytest tests/'
123 | make run
124 | ```
125 | 
126 | To run all tests, including those that require a web connection:
127 | ```
128 | make test
129 | ```
130 | 
131 | To test whether the code conforms to the style rules, you can run:
132 | ```
133 | make lint
134 | ```
135 | 
136 | ### Tests with Web Dependencies
137 | 
138 | Any tests that require a web connection to run should be marked with `@pytest.mark.web_dependency`.
139 | 
140 | These will not run by default, but will run on CI.
141 | 
142 | ### Performing a Dry-Run
143 | 
144 | Before opening a PR, it's good to test the code you wrote on the production data. You can specify a particular Firefox
145 | version to run on by using `--firefox-version`:
146 | ```
147 | export COMMAND='python -m probe_scraper.runner --firefox-version 65 --dry-run'
148 | make run
149 | ```
150 | or locally via:
151 | ```
152 | python -m probe_scraper.runner --firefox-version 65 --dry-run
153 | ```
154 | 
155 | Including `--dry-run` means emails will not be sent.
156 | 
157 | Additionally, you can test just on Glean repositories:
158 | ```
159 | export COMMAND='python -m probe_scraper.runner --glean --dry-run'
160 | make run
161 | ```
162 | 
163 | By default that will test against every Glean repository, which might take a while. If you want to test against just one (e.g. a new repository you're adding), you can use the `--glean-repo` argument to just test the repositories you care about:
164 | ```
165 | export COMMAND='python -m probe_scraper.runner --glean --glean-repo glean-core --glean-repo glean-android --glean-repo burnham --dry-run'
166 | make run
167 | ```
168 | 
169 | Replace burnham in the example above with your repository and its dependencies.
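
If you are adding a new entry to `repositories.yaml`, it is also worth running the repository check that CI runs via `make check-repos`, which verifies that every repository listed there is scrapable:
```
export COMMAND='python -m probe_scraper.check_repositories'
make run
```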
170 | 
171 | You can also do the dry-run locally:
172 | 
173 | ```
174 | python -m probe_scraper.runner --glean --glean-repo glean-core --glean-repo glean-android --glean-repo burnham --dry-run
175 | ```
176 | 
177 | ## Module overview
178 | 
179 | The module is built around the following data flow:
180 | 
181 | - scrape registry files from mozilla-central, clone files from repositories directory
182 | - extract probe data from the files
183 | - transform probe data into output formats
184 | - save to disk
185 | 
186 | The code layout consists mainly of:
187 | 
188 | - `probe_scraper`
189 |   - `runner.py` - the central script, ties the other pieces together
190 |   - `scrapers`
191 |     - `buildhub.py` - pull build info from the [BuildHub](https://buildhub.moz.tools) service
192 |     - `moz_central_scraper.py` - loads probe registry files for multiple versions from mozilla-central
193 |     - `git_scraper.py` - loads probe registry files from a git repository (no version or channel support yet, just per-commit)
194 |   - `parsers/` - extract probe data from the registry files
195 |     - `third_party` - these are imported parser scripts from [mozilla-central](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/telemetry/)
196 |   - `transform_*.py` - transform the extracted raw data into output formats
197 | - `tests/` - the unit tests
198 | 
199 | ## Accessing the data files
200 | The processed probe data is serialized to disk in a directory hierarchy starting from the provided output directory. The directory layout resembles a REST-friendly structure.
201 | 
202 |     |-- product
203 |         |-- general
204 |         |-- revisions
205 |         |-- channel (or "all")
206 |             |-- ping type
207 |                 |-- probe type (or "all_probes")
208 | 
209 | For example, all the JSON probe data in the [main ping]() for the *Firefox Nightly* channel can be accessed with the following path: `firefox/nightly/main/all_probes`. The probe data for all the channels (same product and ping) can be accessed instead using `firefox/all/main/all_probes`.
210 | 
211 | The root directory for the output generated from the scheduled job can be found at <https://probeinfo.telemetry.mozilla.org/>.
212 | All the probe data for Firefox coming from the main ping can be found at <https://probeinfo.telemetry.mozilla.org/firefox/all/main/all_probes>.
213 | 
214 | ## Accessing `Glean` metrics data
215 | Glean data is generally laid out as follows:
216 | 
217 | ```
218 | | -- glean
219 |     | -- repositories
220 |     | -- general
221 |     | -- repository-name
222 |         | -- general
223 |         | -- metrics
224 | ```
225 | 
226 | For example, the data for a repository called `fenix` would be found at [`/glean/fenix/metrics`](https://probeinfo.telemetry.mozilla.org/glean/fenix/metrics). The time the data was last updated for that project can be found at [`glean/fenix/general`](https://probeinfo.telemetry.mozilla.org/glean/fenix/general).
227 | 
228 | A list of available repositories is at [`/glean/repositories`](https://probeinfo.telemetry.mozilla.org/glean/repositories).
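
As an illustration of consuming these endpoints, the sketch below lists the available Glean repositories and prints the metric names recorded for one of them. It is a minimal example: it assumes each entry returned by `/glean/repositories` carries a `name` field and that the metrics payload is a JSON object keyed by metric name.

```python
import requests

BASE_URL = "https://probeinfo.telemetry.mozilla.org"

# List the available Glean repositories.
repos = requests.get(f"{BASE_URL}/glean/repositories").json()
repo_name = repos[0]["name"]  # assumption: each entry has a "name" field

# Fetch the metrics recorded for that repository and print their names.
metrics = requests.get(f"{BASE_URL}/glean/{repo_name}/metrics").json()
for metric_name in sorted(metrics):
    print(metric_name)
```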
229 | 
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | 
4 | def pytest_addoption(parser):
5 |     parser.addoption(
6 |         "--run-web-tests",
7 |         action="store_true",
8 |         default=False,
9 |         help="Run tests that require a web connection",
10 |     )
11 | 
12 | 
13 | def pytest_collection_modifyitems(config, items):
14 |     if config.getoption("--run-web-tests"):
15 |         return
16 |     skip_web = pytest.mark.skip(reason="Need --run-web-tests option to run")
17 |     for item in items:
18 |         if "web_dependency" in item.keywords:
19 |             item.add_marker(skip_web)
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   app:
3 |     build:
4 |       context: .
5 |       dockerfile: Dockerfile
6 |     restart: "no"
7 |     command: "true"
--------------------------------------------------------------------------------
/docs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/probe-scraper/0506e31f38e61ddc662c0eab18826b370314896e/docs.png
--------------------------------------------------------------------------------
/docs/common-failures.md:
--------------------------------------------------------------------------------
1 | # Common failures of probe-scraper runs and how to solve them
2 | 
3 | `probe-scraper` runs every weekday in pull mode for some repositories, such as `mozilla-central`.
4 | It looks at all commits changing metric and ping definition files (`metrics.yaml`, `pings.yaml`).
5 | This can fail for a variety of reasons.
6 | 
7 | ## Backouts
8 | 
9 | Commits adding new metric or ping files get backed out, thus removing the file again.
10 | 
11 | ### Solution
12 | 
13 | Add the offending commits to the `SKIP_COMMITS` list of the product in [`probe_scraper/scrapers/git_scraper.py`][skipcommits].
14 | 
15 | 
16 | [skipcommits]: https://github.com/mozilla/probe-scraper/blob/1d23fcf4d041ea7fdf3e2c0c79252151f472ad0b/probe_scraper/scrapers/git_scraper.py
17 | 
18 | 
19 | ## Invalid metric definition files
20 | 
21 | A new commit changes a `metrics.yaml` file in a way that fails to parse.
22 | That is fixed in a subsequent commit.
23 | 
24 | ### Solution
25 | 
26 | Add the offending commit(s) to `SKIP_COMMITS` as above for [Backouts](#backouts).
27 | 
28 | ## Invalid metric definition files in the past
29 | 
30 | A `metrics.yaml` is already available in old commits in a project, but invalid.
31 | At some point later the file is fixed and correct.
32 | 
33 | ### Solution
34 | 
35 | Add a minimal date from which to start parsing the file in `MIN_DATES` in [`probe-scraper/probe_scraper/scrapers/git_scraper.py`][mindates].
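
For illustration, an entry in `MIN_DATES` maps a repository name (as it appears in `repositories.yaml`) to the first date from which its files should be parsed. The sketch below is hypothetical: the repository name and date are placeholders, and the exact shape should be checked against the current `git_scraper.py`. (`SKIP_COMMITS` above is similar, mapping a repository to a list of commit hashes to skip.)

```python
# Hypothetical sketch: parse metric files for "my-repo" only from this
# date onward, because earlier revisions of its metrics.yaml were invalid.
MIN_DATES = {
    "my-repo": "2021-01-01",
}
```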
36 | 
37 | [mindates]: https://github.com/mozilla/probe-scraper/blob/1d23fcf4d041ea7fdf3e2c0c79252151f472ad0b/probe_scraper/scrapers/git_scraper.py#L29
--------------------------------------------------------------------------------
/fog-updater/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10
2 | 
3 | WORKDIR /usr/src/app
4 | 
5 | COPY requirements.txt ./
6 | RUN pip install --no-cache-dir -r requirements.txt
7 | 
8 | COPY src/* ./
9 | 
10 | ENTRYPOINT ["/usr/src/app/fog_update.py"]
--------------------------------------------------------------------------------
/fog-updater/README.md:
--------------------------------------------------------------------------------
1 | # fog-update-bot
2 | 
3 | Automation to update `repositories.yaml` of `probe-scraper` with the latest `metrics_index.py` list.
4 | 
5 | Fetches and parses the `metrics_index.py` from `mozilla-firefox/firefox`, extracts the relevant list of YAML files
6 | and creates a new Pull Request against `probe-scraper` if it contains any changes.
7 | 
8 | ## Environment variables
9 | 
10 | | Name | Description |
11 | | ---- | ----------- |
12 | | `DEBUG` | If set enables debug logging |
13 | | `DRY_RUN` | If set to `True` will not create a PR |
14 | | `GITHUB_REPOSITORY_OWNER` | The owner of the `probe-scraper` repository |
15 | | `AUTHOR_NAME` | The name to use for the commit |
16 | | `AUTHOR_EMAIL` | The email to use for the commit |
17 | 
18 | ## Running with Docker
19 | 
20 | ```
21 | $ docker build -t fog-update .
22 | $ docker run -it --rm fog-update
23 | ```
24 | 
25 | ## Development
26 | 
27 | ```
28 | $ python3 -m venv env
29 | $ pip install -r requirements.txt
30 | $ pip install pytest
31 | ```
32 | 
33 | ## Testing
34 | 
35 | You can run the tests:
36 | 
37 | ```
38 | pytest
39 | ```
40 | 
41 | Manual runs of the updater require a `GITHUB_TOKEN`.
42 | Go to <https://github.com/settings/tokens> and create a new token (no additional scopes necessary).
43 | Set it in your shell:
44 | 
45 | ```
46 | export GITHUB_TOKEN=<token>
47 | ```
48 | 
49 | ## Code of Conduct
50 | 
51 | This repository is governed by Mozilla's code of conduct and etiquette guidelines.
52 | For more details, please read the
53 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/).
54 | 
55 | See [CODE_OF_CONDUCT.md](../CODE_OF_CONDUCT.md)
56 | 
57 | ## License
58 | 
59 | This Source Code Form is subject to the terms of the Mozilla Public
60 | License, v. 2.0. If a copy of the MPL was not distributed with this
61 | file, You can obtain one at http://mozilla.org/MPL/2.0/
62 | 
63 | See [LICENSE](../LICENSE).
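
## Example dry-run

Following on from the Testing section above, a dry run that fetches the indexes and prints the resulting `repositories.yaml` diff without opening a pull request might look like this (illustrative; it assumes the image was built as `fog-update` as shown under "Running with Docker"):

```
$ docker run -it --rm \
    -e GITHUB_TOKEN="$GITHUB_TOKEN" \
    -e DRY_RUN=True \
    fog-update
```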
64 | 
--------------------------------------------------------------------------------
/fog-updater/action.yml:
--------------------------------------------------------------------------------
1 | name: 'fog-updater'
2 | description: "Update repositories.yaml with FOG's metric files"
3 | runs:
4 |   using: 'docker'
5 |   image: 'Dockerfile'
--------------------------------------------------------------------------------
/fog-updater/requirements.txt:
--------------------------------------------------------------------------------
1 | PyGithub==2.5.0
2 | requests==2.32.3
3 | PyYAML==6.0.2
--------------------------------------------------------------------------------
/fog-updater/src/fog_update.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | # This Source Code Form is subject to the terms of the Mozilla Public
4 | # License, v. 2.0. If a copy of the MPL was not distributed with this
5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/
6 | 
7 | from github import Github, GithubException, InputGitAuthor, enable_console_debug_logging
8 | import datetime
9 | import difflib
10 | import io
11 | import os
12 | import requests
13 | import sys
14 | import yaml
15 | 
16 | DEFAULT_ORGANIZATION = "mozilla"
17 | DEFAULT_AUTHOR_NAME = "data-updater"
18 | DEFAULT_AUTHOR_EMAIL = "telemetry-alerts@mozilla.com"
19 | USAGE = "usage: fog-update"
20 | HTTP_HEADERS = {
21 |     "user-agent": "probe-scraper/1.0",
22 | }
23 | INDEX_URL = "https://raw.githubusercontent.com/mozilla-firefox/firefox/main/toolkit/components/glean/metrics_index.py"  # noqa
24 | FFX_IOS_INDEX_URL = "https://raw.githubusercontent.com/mozilla-mobile/firefox-ios/main/firefox-ios/Client/Glean/glean_index.yaml"  # noqa
25 | BODY_TEMPLATE = f"""This (automated) patch updates the list from metrics_index.py.
26 | 
27 | For reviewers:
28 | 
29 | * Canonical source for the index: <{INDEX_URL}>
30 | * Please double-check that the changes here are valid and that the referenced files exist.
31 | * If the referenced files do not exist, schema deploys will fail
32 | * Delete this branch after merging or closing the PR.
33 | 
34 | ---
35 | 
36 | The source code of this automation bot lives in <https://github.com/mozilla/probe-scraper/tree/main/fog-updater>.
37 | """  # noqa
38 | 
39 | 
40 | class UnmodifiedException(Exception):
41 |     pass
42 | 
43 | 
44 | def ts():
45 |     return str(datetime.datetime.now())
46 | 
47 | 
48 | def eval_extract(code):
49 |     """
50 |     Eval `code` and return a map of variables and their values.
51 | 
52 |     `code` should be valid Python code.
53 |     Only the builtins `list`, `set`, and `sorted` are provided.
54 | 
55 |     Note: this executes arbitrary Python code.
56 |     Because of the limited builtins list this should be reasonably safe.
57 |     Still only use this with known valid code!
58 |     """
59 | 
60 |     # Allow `list`, `set` and `sorted`, so `list(set(a+b+c))` works.
61 |     globs = {"__builtins__": {"list": list, "set": set, "sorted": sorted}}
62 |     exec(code, globs)
63 |     globs.pop("__builtins__")
64 |     return globs
65 | 
66 | 
67 | def swap_file_list(content, app, files, metrics_or_pings, library=False):
68 |     """
69 |     Replace the list of `metrics_files` or `ping_files` in `content` with `files`
70 |     for the given app or library.
71 |     Returns the changed content.
72 | 
73 |     All other content is left untouched.
74 |     YAML syntax is assumed.
75 |     File entries are correctly indented.
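
    Example (illustrative): given YAML content containing

        - app_name: my_app
          metrics_files:
            - old/metrics.yaml

    calling `swap_file_list(content, "my_app", ["a.yaml", "b.yaml"], "metrics")`
    replaces the entry under `metrics_files` with the two new entries at the
    same indentation. Ping and tag lists work the same way via
    `metrics_or_pings="pings"` or `"tags"`.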
76 | """ 77 | output = io.StringIO() 78 | state = None 79 | if library: 80 | app = f"- library_name: {app}" 81 | else: 82 | app = f"- app_name: {app}" 83 | indent = 0 84 | 85 | lines = content.split("\n") 86 | 87 | # Remove trailing newlines. 88 | while not lines[-1]: 89 | lines.pop() 90 | 91 | for line in lines: 92 | if state is None and line.strip() == app: 93 | state = "app" 94 | elif ( 95 | state == "app" 96 | and metrics_or_pings == "metrics" 97 | and "metrics_files:" in line 98 | ): 99 | state = "files" 100 | elif state == "app" and metrics_or_pings == "pings" and "ping_files:" in line: 101 | state = "files" 102 | elif state == "app" and metrics_or_pings == "tags" and "tag_files:" in line: 103 | state = "files" 104 | elif state == "files": 105 | if line.strip().startswith("-"): 106 | indent = line.find("-") 107 | continue 108 | else: 109 | for file in files: 110 | print(" " * indent, file=output, end="") 111 | print(f"- {file}\n", file=output, end="") 112 | state = None 113 | 114 | print(line, file=output) 115 | 116 | return output.getvalue() 117 | 118 | 119 | def get_latest_metrics_index(): 120 | r = requests.get(INDEX_URL, headers=HTTP_HEADERS) 121 | r.raise_for_status() 122 | return r.text 123 | 124 | 125 | def get_latest_ios_metrics_index(): 126 | r = requests.get(FFX_IOS_INDEX_URL, headers=HTTP_HEADERS) 127 | r.raise_for_status() 128 | return r.text 129 | 130 | 131 | def _rewrite_repositories_yaml(repo, branch, data, debug=False): 132 | contents = repo.get_contents("repositories.yaml", ref=branch) 133 | content = contents.decoded_content.decode("utf-8") 134 | 135 | new_content = content 136 | for item in data: 137 | name, metrics_or_pings, library, files = item 138 | new_content = swap_file_list( 139 | new_content, name, files, metrics_or_pings, library 140 | ) 141 | 142 | if content == new_content: 143 | raise UnmodifiedException( 144 | "Update to repositories.yaml resulted in no changes: maybe the file was already up to date?" 
# noqa 145 | ) 146 | 147 | if debug: 148 | diff = difflib.unified_diff( 149 | content.splitlines(keepends=True), 150 | new_content.splitlines(keepends=True), 151 | fromfile="old/repositories.yaml", 152 | tofile="new/repositories.yaml", 153 | ) 154 | sys.stdout.writelines(diff) 155 | 156 | return new_content 157 | 158 | 159 | def _commit_repositories_yaml(repo, branch, author, new_content): 160 | contents = repo.get_contents("repositories.yaml", ref=branch) 161 | 162 | repo.update_file( 163 | contents.path, 164 | "Update repositories.yaml with new FOG metrics_yamls list", 165 | new_content, 166 | contents.sha, 167 | branch=branch, 168 | author=author, 169 | ) 170 | 171 | return True 172 | 173 | 174 | def main(argv, repo, author, debug=False, dry_run=False): 175 | if len(argv) < 1: 176 | print(USAGE) 177 | sys.exit(1) 178 | 179 | release_branch_name = "main" 180 | short_version = "main" 181 | 182 | metrics_index = get_latest_metrics_index() 183 | data = eval_extract(metrics_index) 184 | gecko_metrics = sorted(data["gecko_metrics"]) 185 | gecko_pings = sorted(data["gecko_pings"]) 186 | firefox_desktop_metrics = sorted(data["firefox_desktop_metrics"]) 187 | firefox_desktop_pings = sorted(data["firefox_desktop_pings"]) 188 | background_update_metrics = sorted(data["background_update_metrics"]) 189 | background_update_pings = sorted(data["background_update_pings"]) 190 | background_tasks_metrics = sorted(data["background_tasks_metrics"]) 191 | background_tasks_pings = sorted(data["background_tasks_pings"]) 192 | 193 | ios_metrics_index = get_latest_ios_metrics_index() 194 | data = yaml.safe_load(ios_metrics_index) 195 | firefox_ios_pings = sorted(data["ping_files"]) 196 | firefox_ios_metrics = sorted(data["metrics_files"]) 197 | firefox_ios_tags = sorted(data["tag_files"]) 198 | 199 | data = [ 200 | # Name, metrics/pings, library?, files 201 | ["gecko", "metrics", True, gecko_metrics], 202 | ["gecko", "pings", True, gecko_pings], 203 | ["firefox_desktop", "metrics", False, firefox_desktop_metrics], 204 | ["firefox_desktop", "pings", False, firefox_desktop_pings], 205 | [ 206 | "firefox_desktop_background_update", 207 | "metrics", 208 | False, 209 | background_update_metrics, 210 | ], 211 | ["firefox_desktop_background_update", "pings", False, background_update_pings], 212 | [ 213 | "firefox_desktop_background_tasks", 214 | "metrics", 215 | False, 216 | background_tasks_metrics, 217 | ], 218 | ["firefox_desktop_background_tasks", "pings", False, background_tasks_pings], 219 | ["firefox_ios", "pings", False, firefox_ios_pings], 220 | ["firefox_ios", "metrics", False, firefox_ios_metrics], 221 | ["firefox_ios", "tags", False, firefox_ios_tags], 222 | ] 223 | 224 | print(f"{ts()} Updating repositories.yaml") 225 | try: 226 | new_content = _rewrite_repositories_yaml( 227 | repo, release_branch_name, data, debug=dry_run or debug 228 | ) 229 | except UnmodifiedException as e: 230 | print(f"{ts()} {e}") 231 | return 232 | except Exception as e: 233 | print(f"{ts()} {e}") 234 | raise 235 | 236 | if dry_run: 237 | print(f"{ts()} Dry-run so not continuing.") 238 | return 239 | 240 | # Create a non unique PR branch name for work on this ac release branch. 241 | pr_branch_name = f"fog-update/update-metrics-index-{short_version}" 242 | 243 | try: 244 | pr_branch = repo.get_branch(pr_branch_name) 245 | if pr_branch: 246 | print(f"{ts()} The PR branch {pr_branch_name} already exists. 
Exiting.") 247 | return 248 | except GithubException: 249 | # TODO Only ignore a 404 here, fail on others 250 | pass 251 | 252 | release_branch = repo.get_branch(release_branch_name) 253 | print(f"{ts()} Last commit on {release_branch_name} is {release_branch.commit.sha}") 254 | 255 | print(f"{ts()} Creating branch {pr_branch_name} on {release_branch.commit.sha}") 256 | repo.create_git_ref( 257 | ref=f"refs/heads/{pr_branch_name}", sha=release_branch.commit.sha 258 | ) 259 | print(f"{ts()} Created branch {pr_branch_name} on {release_branch.commit.sha}") 260 | 261 | _commit_repositories_yaml(repo, pr_branch_name, author, new_content) 262 | 263 | print(f"{ts()} Creating pull request") 264 | pr = repo.create_pull( 265 | title=f"Update to latest metrics_index list on {release_branch_name}", 266 | body=BODY_TEMPLATE, 267 | head=pr_branch_name, 268 | base=release_branch_name, 269 | ) 270 | print(f"{ts()} Pull request at {pr.html_url}") 271 | 272 | 273 | if __name__ == "__main__": 274 | debug = os.getenv("DEBUG") is not None 275 | if debug: 276 | enable_console_debug_logging() 277 | 278 | github_access_token = os.getenv("GITHUB_TOKEN") 279 | if not github_access_token: 280 | print("No GITHUB_TOKEN set. Exiting.") 281 | sys.exit(1) 282 | 283 | github = Github(github_access_token) 284 | if github.get_user() is None: 285 | print("Could not get authenticated user. Exiting.") 286 | sys.exit(1) 287 | 288 | dry_run = os.getenv("DRY_RUN") == "True" 289 | 290 | organization = os.getenv("GITHUB_REPOSITORY_OWNER") or DEFAULT_ORGANIZATION 291 | 292 | repo = github.get_repo(f"{organization}/probe-scraper") 293 | 294 | author_name = os.getenv("AUTHOR_NAME") or DEFAULT_AUTHOR_NAME 295 | author_email = os.getenv("AUTHOR_EMAIL") or DEFAULT_AUTHOR_EMAIL 296 | author = InputGitAuthor(author_name, author_email) 297 | 298 | print( 299 | f"{ts()} This is fog-update working on https://github.com/{organization} as {author_email} / {author_name}" # noqa 300 | ) 301 | 302 | main(sys.argv, repo, author, debug, dry_run) 303 | -------------------------------------------------------------------------------- /fog-updater/src/test_util.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. 
If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/ 4 | 5 | 6 | from fog_update import eval_extract, swap_file_list 7 | 8 | 9 | REPOSITORIES_YAML = """ 10 | --- 11 | version: "2" 12 | libraries: 13 | - library_name: gecko 14 | description: The browser engine developed by Mozilla 15 | notification_emails: 16 | - chutten@mozilla.com 17 | url: https://github.com/mozilla-firefox/firefox 18 | metrics_files: 19 | - LIB_METRICS_FILES 20 | ping_files: 21 | - LIB_PING_FILES 22 | 23 | applications: 24 | - app_name: firefox_desktop 25 | metrics_files: 26 | - METRICS_FILES 27 | ping_files: 28 | - PING_FILES 29 | - app_name: firefox_desktop_background_update 30 | metrics_files: 31 | - OTHER_METRICS_FILES 32 | ping_files: 33 | - OTHER_PING_FILES 34 | """ 35 | 36 | METRICS_INDEX = """ 37 | # -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- 38 | # vim: set filetype=python: 39 | 40 | first_yamls = ["A", "B"] 41 | second_yamls = ["B", "C"] 42 | metrics_yamls = sorted(list(set(first_yamls + second_yamls))) 43 | 44 | pings_yamls = [ 45 | "D", 46 | "E", 47 | "F" 48 | ] 49 | """ 50 | 51 | 52 | def test_eval_metrics_index(): 53 | data = eval_extract(METRICS_INDEX) 54 | assert data["first_yamls"] == ["A", "B"] 55 | assert data["second_yamls"] == ["B", "C"] 56 | assert data["metrics_yamls"] == ["A", "B", "C"] 57 | assert data["pings_yamls"] == ["D", "E", "F"] 58 | 59 | 60 | def test_swap_repositories_yaml(): 61 | data = eval_extract(METRICS_INDEX) 62 | metrics_files = data["metrics_yamls"] 63 | output = swap_file_list( 64 | REPOSITORIES_YAML, "firefox_desktop", metrics_files, "metrics" 65 | ) 66 | 67 | # New files added. 68 | assert "- METRICS_FILES" not in output 69 | assert "- A" in output 70 | assert "- B" in output 71 | assert "- C" in output 72 | # ping files untouched. 73 | assert "- PING_FILES" in output 74 | 75 | # Other app untouched 76 | assert "- OTHER_METRICS_FILES" in output 77 | assert "- OTHER_PING_FILES" in output 78 | 79 | 80 | def test_swap_ping_files(): 81 | data = eval_extract(METRICS_INDEX) 82 | metrics_files = data["pings_yamls"] 83 | output = swap_file_list( 84 | REPOSITORIES_YAML, "firefox_desktop", metrics_files, "pings" 85 | ) 86 | 87 | # metrics files untouched. 88 | assert "- METRICS_FILES" in output 89 | # New files added. 90 | assert "- PING_FILES" not in output 91 | assert "- D" in output 92 | assert "- E" in output 93 | assert "- F" in output 94 | 95 | # Other app untouched 96 | assert "- OTHER_METRICS_FILES" in output 97 | assert "- OTHER_PING_FILES" in output 98 | 99 | 100 | def test_swap_repositories_yaml_unchanged(): 101 | metrics_files = ["METRICS_FILES"] 102 | output = swap_file_list( 103 | REPOSITORIES_YAML, "firefox_desktop", metrics_files, "metrics" 104 | ) 105 | 106 | # New files added. 107 | assert "- METRICS_FILES" in output 108 | assert "- A" not in output 109 | # ping files untouched. 110 | assert "- PING_FILES" in output 111 | 112 | # Other app untouched 113 | assert "- OTHER_METRICS_FILES" in output 114 | assert "- OTHER_PING_FILES" in output 115 | 116 | 117 | def test_libraries(): 118 | data = eval_extract(METRICS_INDEX) 119 | metrics_files = data["metrics_yamls"] 120 | output = swap_file_list( 121 | REPOSITORIES_YAML, "gecko", metrics_files, "metrics", library=True 122 | ) 123 | 124 | # New files added. 125 | assert "- LIB_METRICS_FILES" not in output 126 | assert "- A" in output 127 | assert "- B" in output 128 | assert "- C" in output 129 | # ping files untouched. 
130 | assert "- LIB_PING_FILES" in output 131 | 132 | # Other app untouched 133 | assert "- METRICS_FILES" in output 134 | assert "- PING_FILES" in output 135 | assert "- OTHER_METRICS_FILES" in output 136 | assert "- OTHER_PING_FILES" in output 137 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """Google Cloud Function entry points. 2 | 3 | These must be in main.py in same directory as requirements.txt and cannot be nested 4 | inside another package. 5 | """ 6 | 7 | from probe_scraper.glean_push import main as glean_push 8 | 9 | __all__ = ["glean_push"] 10 | -------------------------------------------------------------------------------- /notebooks/load_and_run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# This comes from https://github.com/harterrt/cookiecutter-python-etl/\n", 12 | "# Thanks Harter!" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import boto3\n", 24 | "import botocore\n", 25 | "import os\n", 26 | "\n", 27 | "from io import BytesIO\n", 28 | "from gzip import GzipFile" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "repo_dir = \"probe-scraper\"\n", 40 | "output_dir = \"/home/hadoop/analyses/probe_data\"\n", 41 | "cache_dir = \"/home/hadoop/analyses/probe_cache\"\n", 42 | "repo_https_url = \"https://github.com/mozilla/probe-scraper\"\n", 43 | "\n", 44 | "S3_PUBLIC_BUCKET = \"telemetry-public-analysis-2\"\n", 45 | "S3_DATA_PATH = \"probe-scraper/data-rest/\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "!rm -rf $repo_dir" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "!rm -rf $output_dir" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "!rm -rf $cache_dir" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "!git config --global user.email \"gfritzsche@mozilla.com\" && \\\n", 90 | "git config --global user.name \"Georg Fritzsche\"" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "!git clone $repo_https_url $repo_dir" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "!cd $repo_dir && git pull origin master && python setup.py bdist_egg" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "!mkdir $output_dir && mkdir $cache_dir" 124 | ] 
125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "!cd $repo_dir && pip install -r requirements.txt && python probe_scraper/runner.py --outdir $output_dir --tempdir $cache_dir" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "source": [ 143 | "## Upload the output to S3." 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "# Get access to the S3 connect API.\n", 155 | "client = boto3.client('s3', 'us-west-2')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "def gzip_compress(source_file):\n", 167 | " \"\"\" Apply GZIP compression to the content of the provided file.\n", 168 | "\n", 169 | " :param source_file: the absolute path of the file to compress.\n", 170 | " :return: The gzip compressed content of the input file.\n", 171 | " \"\"\"\n", 172 | " with open(source_file) as fi:\n", 173 | " text_body = fi.read().decode(\"utf-8\")\n", 174 | "\n", 175 | " gz_body = BytesIO()\n", 176 | " gz = GzipFile(None, 'wb', 9, gz_body)\n", 177 | " gz.write(text_body.encode('utf-8'))\n", 178 | " gz.close()\n", 179 | " \n", 180 | " return gz_body.getvalue()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "for path, subdirs, files in os.walk(output_dir):\n", 192 | " relative_path = os.path.relpath(path, output_dir)\n", 193 | " # GZIP-compress the files, then copy them to S3. 
Allow caching for 8 hours.\n", 194 | " for file_name in files:\n", 195 | " source_path = os.path.join(path, file_name)\n", 196 | " key_path = os.path.join(S3_DATA_PATH, relative_path, file_name)\n", 197 | " print \"uploading \" + file_name + \" to s3: \" + key_path\n", 198 | " client.put_object(ACL='public-read', Bucket=S3_PUBLIC_BUCKET,\n", 199 | " Key=key_path, Body=gzip_compress(source_path),\n", 200 | " ContentEncoding='gzip', CacheControl='max-age=28800',\n", 201 | " ContentType='application/json')" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "anaconda-cloud": {}, 216 | "kernelspec": { 217 | "display_name": "Python [conda root]", 218 | "language": "python", 219 | "name": "conda-root-py" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 2 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython2", 231 | "version": "2.7.12" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 1 236 | } 237 | -------------------------------------------------------------------------------- /probe_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/probe-scraper/0506e31f38e61ddc662c0eab18826b370314896e/probe_scraper/__init__.py -------------------------------------------------------------------------------- /probe_scraper/check_repositories.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from collections import defaultdict 4 | from typing import Set, Tuple 5 | 6 | import git 7 | import requests as reqs 8 | 9 | from .parsers.repositories import RepositoriesParser 10 | 11 | GIT = git.Git() 12 | GIT_BRANCH_PATTERN = re.compile("ref: refs/heads/([^\t]+)\tHEAD") 13 | GITHUB_RAW_URL = "https://raw.githubusercontent.com" 14 | REPOSITORIES = os.path.join( 15 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "repositories.yaml" 16 | ) 17 | EXPECTED_MISSING_FILES: Set[Tuple[str, str]] = { 18 | ("support-migration", "components/support/migration/metrics.yaml"), 19 | ("viu-politica", "source/telemetry/metrics.yaml"), 20 | } 21 | validation_errors = [] 22 | repos = RepositoriesParser().parse(REPOSITORIES) 23 | 24 | app_id_channels = defaultdict(lambda: defaultdict(lambda: 0)) 25 | 26 | repo_by_library_name = {} 27 | for repo in repos: 28 | for library_name in repo.library_names or []: 29 | repo_by_library_name[library_name] = repo.name 30 | 31 | for repo in repos: 32 | metrics_files = repo.get_metrics_file_paths() 33 | temp_errors = [] 34 | 35 | if repo.app_id and repo.channel and not repo.deprecated: 36 | app_id_channels[repo.app_id][repo.channel] += 1 37 | 38 | for metric_file in metrics_files: 39 | if repo.deprecated: 40 | continue # ignore missing files for deprecated apps 41 | 42 | if (repo.name, metric_file) in EXPECTED_MISSING_FILES: 43 | continue # ignore missing files 44 | 45 | branch = repo.branch 46 | if branch is None: 47 | match = GIT_BRANCH_PATTERN.match( 48 | GIT.ls_remote("--symref", repo.url, "HEAD") 49 | ) 50 | if match is None: 51 | temp_errors += ["Failed to get default branch from git for " + repo.url] 52 | continue 53 | branch = match.groups()[0] 54 | 55 | temp_url = ( 56 | 
repo.url.replace("https://github.com", GITHUB_RAW_URL) 57 | + "/" 58 | + branch 59 | + "/" 60 | + metric_file 61 | ) 62 | response = reqs.get(temp_url) 63 | if response.status_code != 200: 64 | temp_errors += ["Metrics file was not found at " + temp_url] 65 | 66 | for library_name in repo.dependencies: 67 | if library_name not in repo_by_library_name: 68 | temp_errors.append(f"Dependency not found: {library_name}") 69 | if temp_errors and not repo.prototype: 70 | validation_errors.append({"repo": repo.name, "errors": temp_errors}) 71 | 72 | # Ensure non-deprecated channels are uniquely named 73 | duplication_errors = [] 74 | for app_id, channels in app_id_channels.items(): 75 | temp_errors = [] 76 | for channel_name, num in channels.items(): 77 | if num > 1: 78 | duplication_errors.append( 79 | f"Non-deprecated channel names must be unique, found {channel_name} {num} " 80 | f"times for {app_id}" 81 | ) 82 | 83 | if validation_errors: 84 | print("\nSummary of validation errors:\n") 85 | print(f"{len(validation_errors)} repositories had problems\n") 86 | for error in validation_errors: 87 | print(f"\nErrors found in {error['repo']}:\n") 88 | for line_errors in error["errors"]: 89 | print(line_errors) 90 | 91 | if duplication_errors: 92 | print("\nDuplicate channel names found:\n") 93 | for duplication_error in duplication_errors: 94 | print(duplication_error) 95 | 96 | if validation_errors or duplication_errors: 97 | exit(1) 98 | -------------------------------------------------------------------------------- /probe_scraper/emailer.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from email.mime.application import MIMEApplication 6 | from email.mime.multipart import MIMEMultipart 7 | from email.mime.text import MIMEText 8 | from pathlib import Path 9 | 10 | import boto3 11 | import yaml 12 | 13 | EMAIL_FILE = Path("emails.txt") 14 | 15 | 16 | def send_ses( 17 | fromaddr, subject, body, recipients, filename="", dryrun=True, email_file=None 18 | ): 19 | """Send an email via the Amazon SES service. Can specify a single or list of 20 | recipients. 21 | 22 | Saves emails to `emails.txt`. 
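When `dryrun` is true (the default), nothing is sent through SES; the message is appended to that file and echoed to stdout.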
23 | 24 | Examples: 25 | ``` 26 | send_ses('me@example.com', 'greetings', "Hi!", 'you@example.com') 27 | ``` 28 | 29 | ``` 30 | send_ses('me@example.com', 'greetings', "Hi!", ['a@example.com', 'b@example.com']) 31 | ``` 32 | 33 | Raises a RuntimeError if the message did not send correctly.""" 34 | 35 | if isinstance(recipients, list): 36 | recipients = ",".join(recipients) 37 | 38 | email_data = [ 39 | {"from": fromaddr, "to": recipients, "subject": subject, "body": body} 40 | ] 41 | 42 | if email_file is None: 43 | email_file = EMAIL_FILE 44 | 45 | with open(email_file, "a") as f: 46 | f.write(yaml.dump(email_data, default_flow_style=False)) 47 | 48 | if dryrun: 49 | email_txt = "\n".join( 50 | [ 51 | "New Email", 52 | " From: " + fromaddr, 53 | " To: " + recipients, 54 | " Subject: " + subject, 55 | " Body: " + body, 56 | ] 57 | ) 58 | print(email_txt) 59 | return 60 | 61 | msg = MIMEMultipart() 62 | msg["Subject"] = subject 63 | msg["From"] = fromaddr 64 | msg["To"] = recipients 65 | msg.attach(MIMEText(body)) 66 | 67 | if filename: 68 | attachment = open(filename, "rb").read() 69 | part = MIMEApplication(attachment) 70 | part.add_header("Content-Disposition", "attachment", filename=filename) 71 | msg.attach(part) 72 | 73 | ses = boto3.client("ses", region_name="us-west-2") 74 | result = ses.send_raw_email(RawMessage={"Data": msg.as_string()}) 75 | 76 | if "ErrorResponse" in result: 77 | raise RuntimeError("Error sending email: " + str(result)) 78 | -------------------------------------------------------------------------------- /probe_scraper/exc.py: -------------------------------------------------------------------------------- 1 | class ProbeScraperError(Exception): 2 | """Exception type for returning errors in push mode.""" 3 | 4 | def __init__(self, message, status_code): 5 | self.message = message 6 | self.status_code = status_code 7 | 8 | 9 | class ProbeScraperInvalidRequest(ProbeScraperError): 10 | """Exception type for returning HTTP 4XX in push mode.""" 11 | 12 | def __init__(self, message, status_code=400): 13 | super().__init__(message, status_code) 14 | 15 | 16 | class ProbeScraperServerError(ProbeScraperError): 17 | """Exception type for returning HTTP 5XX in push mode.""" 18 | 19 | def __init__(self, message, status_code=500): 20 | super().__init__(message, status_code) 21 | -------------------------------------------------------------------------------- /probe_scraper/fog_checks.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | """ 6 | This file contains various checks for Firefox on Glean (FOG). 7 | 8 | FOG is Glean, yes, but is sufficiently different that it benefits from doing 9 | its own expiry checks, sending its own emails, and filing its own bugs. 10 | """ 11 | 12 | import re 13 | from collections import defaultdict 14 | from typing import Any, Dict, List, Optional, Set, TypedDict 15 | 16 | from probe_scraper import probe_expiry_alert 17 | 18 | from .glean_checks import get_current_metrics_by_repo 19 | from .parsers.repositories import Repository 20 | 21 | EXPIRED_METRICS_EMAIL_TEMPLATE = """ 22 | Each metric in the following list will soon expire at the end of Firefox {version}.
23 | For your convenience, we've filed bugs to track the work of removing or renewing them: 24 | 25 | {expiring_bugs_list} 26 | 27 | What to do about this: 28 | 29 | 1. If the metric is no longer needed, remove it from its `metrics.yaml` file. 30 | 2. If the metric is still required, extend its expiration. 31 | 32 | If you have any problems, please ask for help on the #glean Matrix channel[1]. We'll give you a hand. 33 | 34 | What happens if you don't fix this: 35 | 36 | The expiring metric will expire, causing a test failure which 37 | * makes sheriffs unhappy, 38 | * prevents developers from landing code, and 39 | * generally makes for a bad time. 40 | 41 | You will continue to get this e-mail as a reminder to clean up. 42 | 43 | Your Friendly Neighbourhood Glean Team 44 | 45 | [1] https://chat.mozilla.org/#/room/#glean:mozilla.org 46 | 47 | This is an automated message sent from probe-scraper. See https://github.com/mozilla/probe-scraper for details. 48 | """ # noqa 49 | 50 | 51 | ### 52 | # Types for Annotations: 53 | ### 54 | class Email(TypedDict): 55 | subject: str 56 | message: str 57 | 58 | 59 | class EmailInfo(TypedDict): 60 | addresses: List[str] 61 | emails: List[Email] 62 | 63 | 64 | # The full list of all repos that are FOG style. Must: 65 | # * Expire based on Firefox Desktop Nightly Version, and 66 | # * Use Bugzilla for its bug urls 67 | FOG_REPOS: Set[str] = {"firefox-desktop", "gecko"} 68 | 69 | 70 | # The BMO whiteboard tag to use for auto-filed bugs 71 | BUG_WHITEBOARD_TAG = "[metric-expiry-alert]" 72 | # The BMO Title, templated by version and metric family 73 | BUG_SUMMARY_TEMPLATE = ( 74 | "Remove or update metrics expiring at the end of Firefox {version}: {probe}" 75 | ) 76 | # BE ALERT: We regex on this template to find existing bugs. 77 | # SEE probe_expiry_alert.find_existing_bugs FOR DETAILS. 78 | # IF YOU MODIFY THIS WITHOUT CARE WE WILL FILE DUPLICATE BUGS. 79 | # Please be kind to your Sheriffs and only modify with care. 80 | BUG_DESCRIPTION_TEMPLATE = """ 81 | The following metrics will expire at the end of Firefox Nightly release: [version {version}][1]. 82 | 83 | ``` 84 | {probes} 85 | ``` 86 | 87 | {notes} 88 | 89 | What to do about this: 90 | 1. If one, some, or all of the metrics are no longer needed, please remove them from their `metrics.yaml` definition file. 91 | 2. If one, some, or all of the metrics are still required, please submit a patch to extend their expiry. 92 | 93 | If you have any problems, please ask for help on the [#glean Matrix room](https://chat.mozilla.org/#/room/#glean:mozilla.org) or the #data-help Slack channel. 94 | We'll give you a hand. 95 | 96 | Your Friendly Neighbourhood Glean Team 97 | 98 | [1]: https://wiki.mozilla.org/Release_Management/Calendar 99 | 100 | --- 101 | This bug was auto-filed by [probe-scraper](https://github.com/mozilla/probe-scraper). 102 | """ # noqa 103 | 104 | 105 | BUG_NUMBER_PATTERN = re.compile(r"\d+") 106 | 107 | 108 | def get_expiring_metrics( 109 | metrics: Dict[str, Dict], latest_nightly_version: str 110 | ) -> Dict[str, Dict]: 111 | """ 112 | Filter the provided dict of metric name to metric info to just the expiring ones. 113 | """ 114 | 115 | # We start warning one version ahead. 116 | target_version = int(latest_nightly_version) + 1 117 | 118 | expiring_metrics = {} 119 | for metric_name, metric in metrics.items(): 120 | if metric["expires"] == "never": 121 | continue 122 | 123 | if metric["expires"] == "expired": 124 | # Also include manually-expired ones. 
125 | # This is not only technically correct, but makes testing easier. 126 | expiring_metrics[metric_name] = metric 127 | continue 128 | 129 | try: 130 | expiry_version = int(metric["expires"]) 131 | except ValueError: 132 | # Expires cannot be parsed as a version. Treat as unexpired. 133 | # TODO: Should we send emails for unparseable expiry versions? 134 | continue 135 | 136 | if expiry_version == target_version: 137 | expiring_metrics[metric_name] = metric 138 | 139 | return expiring_metrics 140 | 141 | 142 | def bug_number_from_url(url: str) -> Optional[int]: 143 | """ 144 | Given a bug url, get its bug number. 145 | If we can't figure out a reasonable bug number, return None. 146 | """ 147 | if "bugz" not in url: 148 | # Not a bugzilla url. We don't understand you. 149 | print(f"Can't figure out bug number for non-bugzilla url: {url}") 150 | return None 151 | 152 | bug = BUG_NUMBER_PATTERN.search(url) 153 | if bug is not None: 154 | try: 155 | bug = int(bug[0]) 156 | except Exception: 157 | print(f"Can't figure out bug number for url: {url}") 158 | return None 159 | return bug 160 | 161 | 162 | def file_bugs( 163 | expiring_metrics: Dict[str, Dict], 164 | latest_nightly_version: str, 165 | bugzilla_api_key: str, 166 | dry_run: bool = True, 167 | ) -> Dict[str, List[str]]: 168 | """ 169 | Find existing and file new Bugzilla bugs for expiring metrics. 170 | Needs a network connection. 171 | If `dry_run`, doesn't file any new bugs, returning a fake bug url for all expiring metrics. 172 | """ 173 | 174 | next_version = str(int(latest_nightly_version) + 1) 175 | 176 | # We try our best to reuse pieces of probe_expiry_alert. 177 | # Swizzle and filter expiring_metrics into a list of ProbeDetails structs. 178 | expiring_probes: List[probe_expiry_alert.ProbeDetails] = [] 179 | for metric_name, metric in expiring_metrics.items(): 180 | bug_numbers: List[Optional[int]] = [ 181 | bug_number_from_url(url) for url in metric["bugs"] 182 | ] 183 | biggest_bug_number: Optional[int] = max( 184 | [bug for bug in bug_numbers if bug is not None], default=None 185 | ) 186 | if biggest_bug_number is not None: 187 | product, component = probe_expiry_alert.get_bug_component( 188 | biggest_bug_number, bugzilla_api_key 189 | ) 190 | else: 191 | product, component = None, None 192 | if product is None and component is None: 193 | product = probe_expiry_alert.BUG_DEFAULT_PRODUCT 194 | component = probe_expiry_alert.BUG_DEFAULT_COMPONENT 195 | 196 | expiring_probes.append( 197 | probe_expiry_alert.ProbeDetails( 198 | metric_name, 199 | product, 200 | component, 201 | metric.get("notification_emails", []), 202 | biggest_bug_number, 203 | ) 204 | ) 205 | 206 | # Debug print time 207 | print(f"Found {len(expiring_probes)} 'probes' expiring in nightly {next_version}:") 208 | print([probe.name for probe in expiring_probes]) 209 | 210 | metrics_to_bug_numbers = probe_expiry_alert.file_bugs( 211 | expiring_probes, 212 | str(latest_nightly_version), 213 | bugzilla_api_key, 214 | dry_run, 215 | BUG_WHITEBOARD_TAG, 216 | BUG_SUMMARY_TEMPLATE, 217 | BUG_DESCRIPTION_TEMPLATE, 218 | ) 219 | 220 | # Swizzle out to a metric_name -> List[bug urls] dict 221 | bug_urls_to_metrics = defaultdict(list) 222 | for metric_name, bug_number in metrics_to_bug_numbers.items(): 223 | bug_urls_to_metrics[ 224 | probe_expiry_alert.BUGZILLA_BUG_LINK_TEMPLATE.format(bug_id=bug_number) 225 | ].append(metric_name) 226 | 227 | if dry_run: 228 | return {"https://example.com/fake_bug_url/": expiring_metrics.keys()} 229 | 230 | return 
bug_urls_to_metrics 231 | 232 | 233 | def file_bugs_and_get_emails_for_expiring_metrics( 234 | repositories: List[Repository], 235 | metrics_by_repo: Dict[str, Dict[str, Dict[str, Any]]], 236 | bugzilla_api_key: Optional[str], 237 | dry_run: bool = True, 238 | ) -> Optional[Dict[str, EmailInfo]]: 239 | """ 240 | If the provided repositories and metrics contain FOG-using repos: 241 | * Determine which metrics are expiring in the next version. 242 | * File bugs in Bugzilla for them, in the product and component of the most recent bug. 243 | At most one bug per metric category. (Doesn't happen if you don't provide an API key.) 244 | * Return a list of emails to send. At most one per FOG repo. 245 | """ 246 | 247 | if len(FOG_REPOS & metrics_by_repo.keys()) == 0: 248 | print("No FOG-using repositories. Nothing to do.") 249 | return None 250 | 251 | # Glean repositories have a default list of notification emails we should include as well. 252 | repo_addresses = { 253 | repo.name: repo.notification_emails 254 | for repo in repositories 255 | if repo.name in FOG_REPOS 256 | } 257 | 258 | current_metrics_by_repo = get_current_metrics_by_repo(metrics_by_repo) 259 | 260 | emails = {} 261 | for fog_repo in FOG_REPOS: 262 | if fog_repo not in metrics_by_repo: 263 | continue 264 | current_metrics: Dict[str, Dict] = current_metrics_by_repo[fog_repo] 265 | latest_nightly_version: str = probe_expiry_alert.get_latest_nightly_version() 266 | expiring_metrics: Dict[str, Dict] = get_expiring_metrics( 267 | current_metrics, latest_nightly_version 268 | ) 269 | 270 | print(f"Found {len(expiring_metrics)} expiring metrics in {fog_repo}.") 271 | if len(expiring_metrics) == 0: 272 | continue 273 | 274 | metrics_addresses = set(repo_addresses[fog_repo]) 275 | for metric in expiring_metrics.values(): 276 | metrics_addresses.update(metric["notification_emails"]) 277 | addresses = list(metrics_addresses) 278 | 279 | filed_bugs: Dict[str, List[str]] = file_bugs( 280 | expiring_metrics, latest_nightly_version, bugzilla_api_key, dry_run 281 | ) 282 | 283 | expiring_bugs_list = [] 284 | for bug_url, bug_metrics in filed_bugs.items(): 285 | # Sort the metric names for easier reading 286 | bug_metrics = list(bug_metrics) 287 | bug_metrics.sort() 288 | 289 | expiring_metrics_list_str = "\n".join(bug_metrics) 290 | expiring_bugs_list.append(f"{bug_url}:\n{expiring_metrics_list_str}") 291 | 292 | # Nothing expiring? No emails needed. 293 | if len(expiring_bugs_list) == 0: 294 | continue 295 | 296 | emails[f"expired_metrics_{fog_repo}"] = EmailInfo( 297 | emails=[ 298 | { 299 | "subject": f"Expired metrics in {fog_repo}", 300 | "message": EXPIRED_METRICS_EMAIL_TEMPLATE.format( 301 | expiring_bugs_list="\n".join(expiring_bugs_list), 302 | version=int(latest_nightly_version), 303 | ), 304 | } 305 | ], 306 | addresses=addresses, 307 | ) 308 | 309 | return emails 310 | -------------------------------------------------------------------------------- /probe_scraper/glean_push.py: -------------------------------------------------------------------------------- 1 | """Google Cloud Function for scraping glean probes from a single commit.""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | import tempfile 7 | from pathlib import Path 8 | from unittest.mock import Mock 9 | 10 | from flask import Request, Response 11 | 12 | from . 
import runner 13 | from .exc import ProbeScraperError 14 | 15 | 16 | def main(request: Request) -> Response: 17 | """Scrape probes from a single glean commit.""" 18 | output_bucket = os.environ.get("OUTPUT_BUCKET", None) 19 | if output_bucket is None: 20 | return Response("Cloud function has no configured output bucket\n", 500) 21 | 22 | args = request.get_json(force=True) 23 | if not isinstance(args, dict): 24 | return Response(f"request body must be a JSON object but got: {args}\n", 400) 25 | try: 26 | url = args["url"] 27 | commit = args["commit"] 28 | branch = args["branch"] 29 | except KeyError as e: 30 | return Response(f"request JSON missing key: {e}\n", 400) 31 | 32 | if not isinstance(url, str): 33 | return Response("Error: url must be a string\n", 400) 34 | if not isinstance(commit, str): 35 | return Response("Error: commit must be a string\n", 400) 36 | if not isinstance(branch, str): 37 | return Response("Error: branch must be a string\n", 400) 38 | 39 | with tempfile.TemporaryDirectory() as tmpdirname: 40 | tmp = Path(tmpdirname) 41 | out_dir = tmp / "output" 42 | cache_dir = tmp / "cache" 43 | email_file = tmp / "emails.txt" 44 | out_dir.mkdir() 45 | cache_dir.mkdir() 46 | try: 47 | updated_paths = runner.main( 48 | cache_dir=cache_dir, 49 | out_dir=out_dir, 50 | firefox_version=None, 51 | min_firefox_version=None, 52 | process_moz_central_probes=False, 53 | process_glean_metrics=True, 54 | repositories_file=Path("repositories.yaml"), 55 | dry_run=True, 56 | glean_repos=None, 57 | firefox_channel=None, 58 | output_bucket=output_bucket, 59 | cache_bucket=None, 60 | env="prod", 61 | bugzilla_api_key=None, 62 | glean_urls=[url], 63 | glean_commit=commit, 64 | glean_commit_branch=branch, 65 | update=True, 66 | email_file=email_file, 67 | ) 68 | except ProbeScraperError as e: 69 | return Response(f"Error: {e.message}\n", e.status_code) 70 | if updated_paths: 71 | updates = ", ".join(str(p.relative_to(out_dir)) for p in updated_paths) 72 | message = f"update published for {updates}\n" 73 | else: 74 | message = "update is valid, but not published\n" 75 | try: 76 | emails = email_file.read_text() 77 | except FileNotFoundError: 78 | pass # no emails means no warnings or expiring metrics, which is good 79 | else: 80 | message += f"additional messages: {emails}\n" 81 | return Response(message, 200) 82 | 83 | 84 | if __name__ == "__main__": 85 | _parser = argparse.ArgumentParser() 86 | _parser.add_argument( 87 | "data", 88 | help="JSON format data describing the glean commit or branch to push", 89 | type=str, 90 | ) 91 | _args = _parser.parse_args() 92 | _data = json.loads(_args.data) 93 | _request = Mock(get_json=Mock(return_value=_data), args=_data) 94 | _response = main(_request) 95 | print(f"HTTP {_response.status_code}: {_response.data.decode()}") 96 | -------------------------------------------------------------------------------- /probe_scraper/model_validation.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from jsonschema import Draft7Validator, RefResolver, validators 3 | 4 | API_FILENAME = "probeinfo_api.yaml" 5 | with open(API_FILENAME, "r") as f: 6 | API = yaml.load(f, Loader=yaml.SafeLoader) 7 | SCHEMAS = API["components"]["schemas"] 8 | RESOLVER = RefResolver("", API) 9 | 10 | 11 | def extend_with_default(validator_class): 12 | """ 13 | Apply default values from the schema when not present. 
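For example (illustrative schema): with `{"deprecated": {"type": "boolean", "default": False}}` as a property, validating `{}` with the extended validator leaves `{"deprecated": False}` in the instance.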
14 | 15 | See https://python-jsonschema.readthedocs.io/en/stable/faq/ 16 | """ 17 | validate_properties = validator_class.VALIDATORS["properties"] 18 | 19 | def set_defaults(validator, properties, instance, schema): 20 | for property, subschema in properties.items(): 21 | if "default" in subschema: 22 | instance.setdefault(property, subschema["default"]) 23 | 24 | for error in validate_properties( 25 | validator, 26 | properties, 27 | instance, 28 | schema, 29 | ): 30 | yield error 31 | 32 | return validators.extend( 33 | validator_class, 34 | {"properties": set_defaults}, 35 | ) 36 | 37 | 38 | Validator = extend_with_default(Draft7Validator) 39 | 40 | 41 | def validate_as(instance, model_name): 42 | schema = SCHEMAS[model_name] 43 | Draft7Validator(schema, resolver=RESOLVER).validate(instance) 44 | 45 | 46 | def apply_defaults_and_validate(instance, model_name): 47 | schema = SCHEMAS[model_name] 48 | Validator(schema, resolver=RESOLVER).validate(instance) 49 | # Send through validation again to be sure any injected default values 50 | # still validate with the schema. 51 | validate_as(instance, model_name) 52 | -------------------------------------------------------------------------------- /probe_scraper/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/probe-scraper/0506e31f38e61ddc662c0eab18826b370314896e/probe_scraper/parsers/__init__.py -------------------------------------------------------------------------------- /probe_scraper/parsers/events.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from .third_party import parse_events 6 | from .utils import get_major_version, set_in_nested_dict 7 | 8 | 9 | def extract_events_data(e): 10 | props = { 11 | # source_field: target_field 12 | # TODO: extract description. 13 | "description": "description", 14 | "expiry_version": "expiry_version", 15 | "expiry_day": "expiry_day", 16 | "cpp_guard": "cpp_guard", 17 | "bug_numbers": "bug_numbers", 18 | "methods": "details/methods", 19 | "objects": "details/objects", 20 | "record_in_processes": "details/record_in_processes", 21 | # TODO: extract key descriptions too. 22 | "extra_keys": "details/extra_keys", 23 | } 24 | 25 | defaults = { 26 | "expiry_version": "never", 27 | "expiry_day": "never", 28 | "name": e.methods[0], 29 | "description": e.description, 30 | "cpp_guard": None, 31 | "bug_numbers": [], 32 | } 33 | 34 | data = {"details": {}} 35 | 36 | for source_field, target_field in props.items(): 37 | value = getattr(e, source_field, e._definition.get(source_field, None)) 38 | if value is None and source_field in defaults: 39 | value = defaults[source_field] 40 | set_in_nested_dict(data, target_field, value) 41 | 42 | # We only care about opt-out or opt-in really. 43 | optout = getattr(e, "dataset", "").endswith("_OPTOUT") 44 | data["optout"] = optout 45 | 46 | # Normalize some field values. 47 | data["expiry_version"] = get_major_version(data["expiry_version"]) 48 | if data["expiry_version"] == "default": 49 | data["expiry_version"] = "never" 50 | 51 | return data 52 | 53 | 54 | class EventsParser: 55 | def parse(self, filenames, version=None, channel=None): 56 | # Events.yaml had a format change in 53, see bug 1329620.
57 | # We don't have important event usage yet, so lets skip 58 | # backwards compatibility for now. 59 | if (version and channel) and ( 60 | ( 61 | (channel != "nightly" and version < 53) 62 | or (channel == "nightly" and version < 54) 63 | ) 64 | ): 65 | return {} 66 | 67 | if len(filenames) > 1: 68 | raise Exception("We don't support loading from more than one file.") 69 | 70 | events = parse_events.load_events(filenames[0], strict_type_checks=False) 71 | 72 | # Get the probe information in a standard format. 73 | out = {} 74 | for e in events: 75 | full_name = e.category + "." + e.methods[0] 76 | if getattr(e, "name", None): 77 | full_name += "#" + e.name 78 | out[full_name] = extract_events_data(e) 79 | 80 | return out 81 | -------------------------------------------------------------------------------- /probe_scraper/parsers/histograms.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from .third_party import histogram_tools 6 | from .utils import get_major_version, set_in_nested_dict 7 | 8 | 9 | def extract_histogram_data(histogram, version): 10 | props = { 11 | # source_field: target_field 12 | "cpp_guard": "cpp_guard", 13 | "description": "description", 14 | "expiration": "expiry_version", 15 | "bug_numbers": "bug_numbers", 16 | "alert_emails": "notification_emails", 17 | "n_buckets": "details/n_buckets", 18 | "low": "details/low", 19 | "high": "details/high", 20 | "keyed": "details/keyed", 21 | "kind": "details/kind", 22 | "record_in_processes": "details/record_in_processes", 23 | "record_into_store": "details/record_into_store", 24 | } 25 | 26 | defaults = { 27 | "cpp_guard": None, 28 | "keyed": False, 29 | "expiration": "never", 30 | "bug_numbers": [], 31 | "alert_emails": [], 32 | } 33 | 34 | data = {"details": {}} 35 | 36 | for source_field, target_field in props.items(): 37 | value = None 38 | if hasattr(histogram, source_field): 39 | value = getattr(histogram, source_field)() 40 | elif source_field in histogram._definition: 41 | value = histogram._definition.get(source_field) 42 | elif source_field in defaults: 43 | value = defaults[source_field] 44 | set_in_nested_dict(data, target_field, value) 45 | 46 | # Only include labels if the histogram is categorical. 47 | if histogram.kind() == "categorical": 48 | set_in_nested_dict(data, "details/labels", histogram.labels()) 49 | 50 | # We only care about opt-out or opt-in really. 51 | optout = False 52 | if hasattr(histogram, "dataset"): 53 | optout = getattr(histogram, "dataset")().endswith("_OPTOUT") 54 | 55 | # Use Counters are shipped on release since 65. 56 | # If the parsers would set this flag, we couldn't differentiate between versions. 57 | if int(version) >= 65: 58 | if histogram.name().startswith("USE_COUNTER2_"): 59 | optout = True 60 | 61 | data["optout"] = optout 62 | 63 | # Normalize some field values. 64 | data["expiry_version"] = get_major_version(data["expiry_version"]) 65 | if data["expiry_version"] == "default": 66 | data["expiry_version"] = "never" 67 | if data["details"]["keyed"] == "true": 68 | data["details"]["keyed"] = True 69 | 70 | # TODO: Fixup old non-number values & expressions. 
71 | # History: bug 920169, bug 1245910 72 | # "JS::gcreason::NUM_TELEMETRY_REASONS" 73 | # "JS::gcreason::NUM_TELEMETRY_REASONS+1" 74 | # "mozilla::StartupTimeline::MAX_EVENT_ID" 75 | 76 | return data 77 | 78 | 79 | def transform_probe_info(probes, version): 80 | return dict( 81 | (probe.name(), extract_histogram_data(probe, version)) for probe in probes 82 | ) 83 | 84 | 85 | class HistogramsParser: 86 | def parse(self, filenames, version=None, channel=None): 87 | # Call the histogram tools for each file. 88 | parsed_probes = list(histogram_tools.from_files(filenames)) 89 | 90 | # Get the probe information in a standard format. 91 | return transform_probe_info(parsed_probes, version) 92 | -------------------------------------------------------------------------------- /probe_scraper/parsers/metrics.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from pathlib import Path 6 | 7 | from glean_parser.parser import parse_objects 8 | 9 | from .pings import normalize_ping_name 10 | from .utils import get_source_url 11 | 12 | 13 | class GleanMetricsParser: 14 | """ 15 | Use the [Glean Parser] 16 | (https://mozilla.github.io/glean_parser) 17 | to parse the metrics.yaml files. 18 | """ 19 | 20 | def parse(self, filenames, config, repo_url=None, commit_hash=None): 21 | config = config.copy() 22 | config["do_not_disable_expired"] = True 23 | 24 | paths = [Path(fname) for fname in filenames] 25 | paths = [path for path in paths if path.is_file()] 26 | results = parse_objects(paths, config) 27 | errors = [err for err in results] 28 | 29 | metrics = { 30 | metric.identifier(): metric.serialize() 31 | for category, probes in results.value.items() 32 | for probe_name, metric in probes.items() 33 | } 34 | 35 | for v in metrics.values(): 36 | v["send_in_pings"] = [normalize_ping_name(p) for p in v["send_in_pings"]] 37 | if repo_url and commit_hash: 38 | v["source_url"] = get_source_url(v["defined_in"], repo_url, commit_hash) 39 | # the 'defined_in' structure is no longer needed 40 | del v["defined_in"] 41 | return metrics, errors 42 | -------------------------------------------------------------------------------- /probe_scraper/parsers/pings.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
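# Usage sketch (illustrative only; the repo URL and commit hash below are
# hypothetical). GleanPingsParser, defined further down, is driven the same
# way as the metrics parser above:
#
#     parser = GleanPingsParser()
#     pings, errors = parser.parse(
#         ["pings.yaml"],
#         {"allow_reserved": False},
#         repo_url="https://github.com/mozilla/example",
#         commit_hash="abcdef0",
#     )
#
# repo_url and commit_hash are optional; when given, they are only used to
# attach a source_url to each ping definition. Ping names are normalized via
# PING_NAME_NORMALIZATION, e.g. "deletion_request" becomes "deletion-request".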
4 | 5 | from pathlib import Path 6 | 7 | from glean_parser.parser import parse_objects 8 | 9 | from .utils import get_source_url 10 | 11 | PING_NAME_NORMALIZATION = { 12 | "deletion_request": "deletion-request", 13 | "bookmarks_sync": "bookmarks-sync", 14 | "history_sync": "history-sync", 15 | "session_end": "session-end", 16 | } 17 | 18 | 19 | def normalize_ping_name(name): 20 | return PING_NAME_NORMALIZATION.get(name, name) 21 | 22 | 23 | def generate_definition(ping_data, repo_url, commit_hash): 24 | serialized = ping_data.serialize() 25 | if repo_url and commit_hash: 26 | serialized["source_url"] = get_source_url( 27 | serialized["defined_in"], repo_url, commit_hash 28 | ) 29 | # the 'defined_in' structure is no longer needed 30 | del serialized["defined_in"] 31 | return serialized 32 | 33 | 34 | class GleanPingsParser: 35 | """ 36 | Use the [Glean Parser] 37 | (https://mozilla.github.io/glean_parser) 38 | to parse the pings.yaml files. 39 | """ 40 | 41 | def parse(self, filenames, config, repo_url=None, commit_hash=None): 42 | config = config.copy() 43 | paths = [Path(fname) for fname in filenames] 44 | paths = [path for path in paths if path.is_file()] 45 | results = parse_objects(paths, config) 46 | errors = [err for err in results] 47 | 48 | pings = { 49 | normalize_ping_name(ping_name): generate_definition( 50 | ping_data, repo_url, commit_hash 51 | ) 52 | for category, pings in results.value.items() 53 | for ping_name, ping_data in pings.items() 54 | } 55 | 56 | return pings, errors 57 | -------------------------------------------------------------------------------- /probe_scraper/parsers/repositories.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import copy 6 | 7 | import yaml 8 | 9 | from probe_scraper import model_validation 10 | 11 | REPOSITORIES_FILENAME = "repositories.yaml" 12 | 13 | 14 | def remove_none(obj): 15 | """ 16 | Recursively traverses a dict or list, removing all dict items where the value 17 | is None. This helps us meet the existing probeinfo API contract and sidesteps 18 | an awkward incompatibility between JSON schemas and OpenAPI schemas, which use 19 | incompatible constructs for marking fields as nullable. 
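For example, remove_none({"url": None, "branch": "main"}) returns {"branch": "main"}; None entries nested in lists and tuples are dropped the same way.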
20 | 21 | Implementation from https://stackoverflow.com/a/20558778 22 | """ 23 | if isinstance(obj, (list, tuple, set)): 24 | return type(obj)(remove_none(x) for x in obj if x is not None) 25 | elif isinstance(obj, dict): 26 | return type(obj)( 27 | (remove_none(k), remove_none(v)) 28 | for k, v in obj.items() 29 | if k is not None and v is not None 30 | ) 31 | else: 32 | return obj 33 | 34 | 35 | class Repository(object): 36 | """ 37 | A class representing a repository, read in from `repositories.yaml` 38 | """ 39 | 40 | def __init__(self, name, definition): 41 | self.name = name 42 | self.url = definition.get("url") 43 | self.branch = definition.get("branch", None) 44 | self.notification_emails = definition.get("notification_emails") 45 | self.app_id = definition.get("app_id") 46 | self.description = definition.get("description") 47 | self.channel = definition.get("channel") 48 | self.deprecated = definition.get("deprecated", False) 49 | self.metrics_file_paths = definition.get("metrics_files", []) 50 | self.ping_file_paths = definition.get("ping_files", []) 51 | self.tag_file_paths = definition.get("tag_files", []) 52 | self.library_names = definition.get("library_names", None) 53 | self.dependencies = definition.get("dependencies", []) 54 | self.prototype = definition.get("prototype", False) 55 | self.retention_days = definition.get("retention_days", None) 56 | self.encryption = definition.get("encryption", None) 57 | self.skip_documentation = definition.get("skip_documentation", False) 58 | self.moz_pipeline_metadata_defaults = definition.get( 59 | "moz_pipeline_metadata_defaults", {} 60 | ) 61 | self.moz_pipeline_metadata = definition.get("moz_pipeline_metadata", {}) 62 | 63 | def get_metrics_file_paths(self): 64 | return self.metrics_file_paths 65 | 66 | def get_ping_file_paths(self): 67 | return self.ping_file_paths 68 | 69 | def get_change_files(self): 70 | return self.metrics_file_paths + self.ping_file_paths + self.tag_file_paths 71 | 72 | def get_dependencies(self): 73 | return self.dependencies 74 | 75 | def to_dict(self): 76 | # Remove null elements 77 | # https://google.github.io/styleguide/jsoncstyleguide.xml#Empty/Null_Property_Values 78 | return {k: v for k, v in list(self.__dict__.items()) if v is not None} 79 | 80 | 81 | class RepositoriesParser(object): 82 | """ 83 | A parser for `repositories.yaml` files, which both validates and retrieves Repository objects 84 | """ 85 | 86 | def _get_repos(self, filename=None): 87 | if filename is None: 88 | filename = REPOSITORIES_FILENAME 89 | 90 | with open(filename, "r") as f: 91 | repos = yaml.load(f, Loader=yaml.SafeLoader) 92 | 93 | version = repos.get("version", "1") 94 | if version == "1": 95 | return repos 96 | else: 97 | return self._v2_to_v1(filename) 98 | 99 | def validate(self, filename=None): 100 | data = self._get_repos(filename) 101 | model_validation.validate_as(data, "RepositoriesYamlV1") 102 | 103 | def parse(self, filename=None): 104 | """ 105 | Parse the given filename as a set of repository definitions for v1 endpoints. 106 | 107 | The passed file can either be in the old RepositoriesYamlV1 format 108 | or the current RepositoriesYamlV2 format, in which case it will be 109 | "downgraded" to v1 format. This is to maintain existing code and output for 110 | the v1 probeinfo endpoints. 111 | 112 | New endpoints should be built with the data format returned from parse_v2 113 | rather than this function. 
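For example, RepositoriesParser().parse() with no argument reads the checked-in repositories.yaml and returns a list of Repository objects, whichever schema version the file uses.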
114 | """ 115 | self.validate(filename) 116 | repos = self._get_repos(filename) 117 | 118 | repos = [ 119 | Repository(name, definition) for name, definition in list(repos.items()) 120 | ] 121 | 122 | return repos 123 | 124 | def parse_v2(self, filename=None) -> dict: 125 | """ 126 | Parse the given filename as a set of repository definitions. 127 | 128 | The passed file must be in the current RepositoriesYamlV2 format. 129 | """ 130 | with open(filename or REPOSITORIES_FILENAME, "r") as f: 131 | data = yaml.load(f, Loader=yaml.SafeLoader) 132 | model_validation.apply_defaults_and_validate(data, "RepositoriesYamlV2") 133 | repos = data 134 | 135 | app_listings = [] 136 | for app in repos["applications"]: 137 | channels = app.pop("channels") 138 | for channel in channels: 139 | dependencies = app.get("dependencies", []) + channel.pop( 140 | "additional_dependencies", [] 141 | ) 142 | listing = {**app, **channel} 143 | listing["dependencies"] = dependencies 144 | app_id = listing["app_id"] 145 | listing["document_namespace"] = ( 146 | app_id.lower().replace("_", "-").replace(".", "-") 147 | ) 148 | listing["bq_dataset_family"] = ( 149 | app_id.lower().replace("-", "_").replace(".", "_") 150 | ) 151 | # Need a deepcopy to ensure the dictionary values remain distinct. 152 | listing = copy.deepcopy(listing) 153 | model_validation.validate_as(listing, "AppListing") 154 | app_listings.append(listing) 155 | 156 | library_variants = [] 157 | for lib in repos["libraries"]: 158 | variants = lib.pop("variants") 159 | for variant in variants: 160 | listing = {**lib, **variant} 161 | model_validation.validate_as(listing, "LibraryVariant") 162 | library_variants.append(listing) 163 | 164 | return { 165 | "library-variants": library_variants, 166 | "app-listings": app_listings, 167 | } 168 | 169 | def _v2_to_v1(self, filename): 170 | repos_v2 = self.parse_v2(filename) 171 | repos = {} 172 | for lib in repos_v2["library-variants"]: 173 | v1_name = lib["v1_name"] 174 | lib["library_names"] = [lib["dependency_name"]] 175 | lib["app_id"] = v1_name 176 | del lib["library_name"] 177 | del lib["dependency_name"] 178 | del lib["v1_name"] 179 | repos[v1_name] = lib 180 | for app in repos_v2["app-listings"]: 181 | app_channel = app.pop("app_channel", None) 182 | if app_channel is not None: 183 | app["channel"] = app_channel 184 | v1_name = app.pop("v1_name") 185 | app.pop("app_name") 186 | app.pop("canonical_app_name", None) 187 | app.pop("bq_dataset_family") 188 | app_description = app.pop("app_description", None) 189 | app["description"] = app.get("description", app_description) 190 | namespace = app.pop("document_namespace") 191 | app["app_id"] = namespace 192 | repos[v1_name] = app 193 | return repos 194 | -------------------------------------------------------------------------------- /probe_scraper/parsers/scalars.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | from .third_party import parse_scalars 6 | from .utils import get_major_version 7 | 8 | 9 | def extract_scalar_data(s): 10 | 11 | # External scalars.yaml files have release/prerelease, not opt-in/opt-out 12 | try: 13 | optout = s.dataset.endswith("_OPTOUT") 14 | except KeyError: 15 | optout = s._definition.get("collect_on_channels", "prerelease") == "release" 16 | 17 | return { 18 | "description": s.description, 19 | "expiry_version": get_major_version(s.expires), 20 | "cpp_guard": s.cpp_guard, 21 | "optout": optout, 22 | "bug_numbers": s.bug_numbers, 23 | "notification_emails": s.notification_emails, 24 | "details": { 25 | "keyed": s.keyed, 26 | "kind": s.kind, 27 | "record_in_processes": s.record_in_processes, 28 | "record_into_store": s.record_into_store, 29 | }, 30 | } 31 | 32 | 33 | def transform_scalar_info(probes): 34 | return dict((probe.label, extract_scalar_data(probe)) for probe in probes) 35 | 36 | 37 | class ScalarsParser: 38 | def parse(self, filenames, version=None, channel=None): 39 | if len(filenames) > 1: 40 | raise Exception("We don't support loading from more than one file.") 41 | 42 | scalars = parse_scalars.load_scalars(filenames[0], strict_type_checks=False) 43 | 44 | # Get the probe information in a standard format. 45 | return transform_scalar_info(scalars) 46 | -------------------------------------------------------------------------------- /probe_scraper/parsers/tags.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from pathlib import Path 6 | 7 | from glean_parser.parser import parse_objects 8 | 9 | from .utils import get_source_url 10 | 11 | 12 | class GleanTagsParser: 13 | """ 14 | Use the [Glean Parser] 15 | (https://mozilla.github.io/glean_parser) 16 | to parse tags.yaml files. 17 | """ 18 | 19 | def parse(self, filenames, config, repo_url=None, commit_hash=None): 20 | config = config.copy() 21 | paths = [Path(fname) for fname in filenames] 22 | paths = [path for path in paths if path.is_file()] 23 | results = parse_objects(paths, config) 24 | errors = [err for err in results] 25 | tags = { 26 | tag_name: tag_data.serialize() 27 | for tag_name, tag_data in results.value.get("tags", {}).items() 28 | } 29 | 30 | for v in tags.values(): 31 | if repo_url and commit_hash: 32 | v["source_url"] = get_source_url(v["defined_in"], repo_url, commit_hash) 33 | # the 'defined_in' structure is no longer needed 34 | del v["defined_in"] 35 | return tags, errors 36 | -------------------------------------------------------------------------------- /probe_scraper/parsers/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/probe-scraper/0506e31f38e61ddc662c0eab18826b370314896e/probe_scraper/parsers/third_party/__init__.py -------------------------------------------------------------------------------- /probe_scraper/parsers/third_party/shared_telemetry_utils.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | # This file contains utility functions shared by the scalars and the histogram generation 6 | # scripts. 7 | 8 | 9 | import re 10 | import sys 11 | 12 | import yaml 13 | 14 | # This is a list of flags that determine which process a measurement is allowed 15 | # to record from. 16 | KNOWN_PROCESS_FLAGS = { 17 | "all": "All", 18 | "all_children": "AllChildren", 19 | "main": "Main", 20 | "content": "Content", 21 | "gpu": "Gpu", 22 | "socket": "Socket", 23 | # Historical Values 24 | "all_childs": "AllChildren", # Supporting files from before bug 1363725 25 | } 26 | 27 | PROCESS_ENUM_PREFIX = "mozilla::Telemetry::Common::RecordedProcessType::" 28 | 29 | 30 | class ParserError(Exception): 31 | """Thrown by different probe parsers. Errors are partitioned into 32 | 'immediately fatal' and 'eventually fatal' so that the parser can print 33 | multiple error messages at a time. See bug 1401612 .""" 34 | 35 | eventual_errors = [] 36 | 37 | def __init__(self, *args): 38 | Exception.__init__(self, *args) 39 | 40 | def handle_later(self): 41 | ParserError.eventual_errors.append(self) 42 | 43 | def handle_now(self): 44 | ParserError.print_eventuals() 45 | print(self.message, file=sys.stderr) 46 | sys.exit(1) 47 | 48 | @classmethod 49 | def print_eventuals(cls): 50 | while cls.eventual_errors: 51 | print(cls.eventual_errors.pop(0).message, file=sys.stderr) 52 | 53 | @classmethod 54 | def exit_func(cls): 55 | if cls.eventual_errors: 56 | cls("Some errors occurred").handle_now() 57 | 58 | 59 | def is_valid_process_name(name): 60 | return name in KNOWN_PROCESS_FLAGS 61 | 62 | 63 | def process_name_to_enum(name): 64 | return PROCESS_ENUM_PREFIX + KNOWN_PROCESS_FLAGS.get(name) 65 | 66 | 67 | class StringTable: 68 | """Manages a string table and allows C style serialization to a file.""" 69 | 70 | def __init__(self): 71 | self.current_index = 0 72 | self.table = {} 73 | 74 | def c_strlen(self, string): 75 | """The length of a string including the null terminating character. 76 | :param string: the input string. 77 | """ 78 | return len(string) + 1 79 | 80 | def stringIndex(self, string): 81 | """Returns the index in the table of the provided string. Adds the string to 82 | the table if it's not there. 83 | :param string: the input string. 84 | """ 85 | if string in self.table: 86 | return self.table[string] 87 | else: 88 | result = self.current_index 89 | self.table[string] = result 90 | self.current_index += self.c_strlen(string) 91 | return result 92 | 93 | def stringIndexes(self, strings): 94 | """Returns a list of indexes for the provided list of strings. 95 | Adds the strings to the table if they are not in it yet. 96 | :param strings: list of strings to put into the table. 97 | """ 98 | return [self.stringIndex(s) for s in strings] 99 | 100 | def writeDefinition(self, f, name): 101 | """Writes the string table to a file as a C const char array. 102 | 103 | This writes out the string table as one single C char array for memory 104 | size reasons, separating the individual strings with '\0' characters. 105 | This way we can index directly into the string array and avoid the additional 106 | storage costs for the pointers to them (and potential extra relocations for those). 107 | 108 | :param f: the output stream. 109 | :param name: the name of the output array. 110 | """ 111 | entries = list(self.table.items()) 112 | entries.sort(key=lambda x: x[1]) 113 | 114 | # Avoid null-in-string warnings with GCC and potentially 115 | # overlong string constants; write everything out the long way. 
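        # Illustrative example (whitespace approximate): a table holding only
        # the string "hi" at offset 0 serializes as
        #
        #     const char name[] = {
        #       /*     0 - "hi" */ 'h', 'i', '\0',
        #     };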
116 | def explodeToCharArray(string): 117 | def toCChar(s): 118 | if s == "'": 119 | return "'\\''" 120 | else: 121 | return "'%s'" % s 122 | 123 | return ", ".join(map(toCChar, string)) 124 | 125 | f.write("const char %s[] = {\n" % name) 126 | for string, offset in entries: 127 | if "*/" in string: 128 | raise ValueError( 129 | "String in string table contains unexpected sequence '*/': %s" 130 | % string 131 | ) 132 | 133 | e = explodeToCharArray(string) 134 | if e: 135 | f.write( 136 | " /* %5d - \"%s\" */ %s, '\\0',\n" 137 | % (offset, string, e) 138 | ) 139 | else: 140 | f.write(" /* %5d - \"%s\" */ '\\0',\n" % (offset, string)) 141 | f.write("};\n\n") 142 | 143 | 144 | def static_assert(output, expression, message): 145 | """Writes a C++ compile-time assertion expression to a file. 146 | :param output: the output stream. 147 | :param expression: the expression to check. 148 | :param message: the string literal that will appear if the expression evaluates to 149 | false. 150 | """ 151 | print('static_assert(%s, "%s");' % (expression, message), file=output) 152 | 153 | 154 | def validate_expiration_version(expiration): 155 | """Makes sure the expiration version has the expected format. 156 | 157 | Allowed examples: "1.0", "20", "300.0a1", "60.0a1", "30.5a1", "never" 158 | Disallowed examples: "Never", "asd", "4000000", "60a1" 159 | 160 | :param expiration: the expiration version string. 161 | :return: True if the expiration validates correctly, False otherwise. 162 | """ 163 | if expiration != "never" and not re.match(r"^\d{1,3}(\.\d|\.\da1)?$", expiration): 164 | return False 165 | 166 | return True 167 | 168 | 169 | def add_expiration_postfix(expiration): 170 | """Formats the expiration version and adds a version postfix if needed. 171 | 172 | :param expiration: the expiration version string. 173 | :return: the modified expiration string. 174 | """ 175 | if re.match(r"^[1-9][0-9]*$", expiration): 176 | return expiration + ".0a1" 177 | 178 | if re.match(r"^[1-9][0-9]*\.0$", expiration): 179 | return expiration + "a1" 180 | 181 | return expiration 182 | 183 | 184 | def load_yaml_file(filename): 185 | """Load a YAML file from disk, throw a ParserError on failure.""" 186 | try: 187 | with open(filename, "r") as f: 188 | return yaml.safe_load(f) 189 | except IOError as e: 190 | raise ParserError("Error opening " + filename + ": " + str(e)) 191 | except ValueError as e: 192 | raise ParserError( 193 | "Error parsing processes in {}: {}".format(filename, str(e)) 194 | ) 195 | -------------------------------------------------------------------------------- /probe_scraper/parsers/third_party/usecounters.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import collections 6 | import re 7 | import sys 8 | 9 | 10 | def read_conf(conf_filename): 11 | # Can't read/write from a single StringIO, so make a new one for reading.
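    # Illustrative entries in the syntax accepted below (the names are
    # hypothetical, not real counters):
    #
    #     method Document.exitPointerLock
    #     attribute Window.localStorage
    #     property MozAppearance
    #     custom EXAMPLE_COUNTER description of the counter
    #
    # Blank lines and lines starting with // are skipped.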
12 | stream = open(conf_filename) 13 | 14 | def parse_counters(stream): 15 | for line_num, line in enumerate(stream): 16 | line = line.rstrip("\n") 17 | if not line or line.startswith("//"): 18 | # empty line or comment 19 | continue 20 | m = re.match(r"method ([A-Za-z0-9]+)\.([A-Za-z0-9]+)$", line) 21 | if m: 22 | interface_name, method_name = m.groups() 23 | yield { 24 | "type": "method", 25 | "interface_name": interface_name, 26 | "method_name": method_name, 27 | } 28 | continue 29 | m = re.match(r"attribute ([A-Za-z0-9]+)\.([A-Za-z0-9]+)$", line) 30 | if m: 31 | interface_name, attribute_name = m.groups() 32 | yield { 33 | "type": "attribute", 34 | "interface_name": interface_name, 35 | "attribute_name": attribute_name, 36 | } 37 | continue 38 | m = re.match(r"property ([A-Za-z0-9]+)$", line) 39 | if m: 40 | property_name = m.group(1) 41 | yield {"type": "property", "property_name": property_name} 42 | continue 43 | m = re.match(r"custom ([A-Za-z0-9_]+) (.*)$", line) 44 | if m: 45 | name, desc = m.groups() 46 | yield {"type": "custom", "name": name, "desc": desc} 47 | continue 48 | raise ValueError("error parsing %s at line %d" % (conf_filename, line_num)) 49 | 50 | return parse_counters(stream) 51 | 52 | 53 | def generate_histograms(filename): 54 | # The mapping for use counters to telemetry histograms depends on the 55 | # ordering of items in the dictionary. 56 | items = collections.OrderedDict() 57 | for counter in read_conf(filename): 58 | 59 | def append_counter(name, desc): 60 | items[name] = { 61 | "expires_in_version": "never", 62 | "kind": "boolean", 63 | "description": desc, 64 | } 65 | 66 | def append_counters(name, desc): 67 | append_counter( 68 | "USE_COUNTER2_%s_DOCUMENT" % name, "Whether a document %s" % desc 69 | ) 70 | append_counter("USE_COUNTER2_%s_PAGE" % name, "Whether a page %s" % desc) 71 | 72 | if counter["type"] == "method": 73 | method = "%s.%s" % (counter["interface_name"], counter["method_name"]) 74 | append_counters(method.replace(".", "_").upper(), "called %s" % method) 75 | elif counter["type"] == "attribute": 76 | attr = "%s.%s" % (counter["interface_name"], counter["attribute_name"]) 77 | counter_name = attr.replace(".", "_").upper() 78 | append_counters("%s_getter" % counter_name, "got %s" % attr) 79 | append_counters("%s_setter" % counter_name, "set %s" % attr) 80 | elif counter["type"] == "property": 81 | prop = counter["property_name"] 82 | append_counters( 83 | "PROPERTY_%s" % prop.replace("-", "_").upper(), 84 | "used the '%s' property" % prop, 85 | ) 86 | elif counter["type"] == "custom": 87 | append_counters(counter["name"].upper(), counter["desc"]) 88 | 89 | return items 90 | -------------------------------------------------------------------------------- /probe_scraper/parsers/utils.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | HTTP_HEADERS = { 6 | "user-agent": "probe-scraper/1.0", 7 | } 8 | 9 | 10 | def set_in_nested_dict(dictionary, path, value): 11 | """Set a property in a nested dictionary by specifying a path to it. 
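    Intermediate keys must already exist; only the leaf key is assigned, so
    e.g. set_in_nested_dict({"a": {}}, "a/b/c", 1) raises KeyError("b").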
12 | 13 | A call like e.g.: 14 | set_in_nested_dict(d, "a/b/c", 1) 15 | is equivalent to: 16 | d["a"]["b"]["c"] = 1 17 | """ 18 | keys = path.split("/") 19 | for k in keys[:-1]: 20 | dictionary = dictionary[k] 21 | dictionary[keys[-1]] = value 22 | 23 | 24 | def get_major_version(version): 25 | """Extracts the major (leftmost) version of a version string. 26 | 27 | :param version: the version string (e.g. "53.1") 28 | :return: a string containing the leftmost number before the first dot 29 | """ 30 | return version.split(".")[0] 31 | 32 | 33 | def get_source_url(glean_definition, repo_url, commit_hash): 34 | """Add source URL where metrics and pings are defined.""" 35 | line_number = glean_definition["line"] 36 | file_path = glean_definition["filepath"][ 37 | glean_definition["filepath"].find(commit_hash) : # noqa: E203 38 | ] 39 | return f"{repo_url}/blob/{file_path}#L{line_number}" 40 | -------------------------------------------------------------------------------- /probe_scraper/remote_storage.py: -------------------------------------------------------------------------------- 1 | import fnmatch 2 | import gzip 3 | import subprocess 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import List, Optional, Tuple, Union 7 | 8 | from .exc import ProbeScraperServerError 9 | 10 | TEXT_HTML = "text/html" 11 | APPLICATION_JSON = "application/json" 12 | INDEX_HTML = "index.html" 13 | 14 | 15 | def _call(args: List[str]): 16 | process = subprocess.run( 17 | args, 18 | stdout=subprocess.PIPE, 19 | stderr=subprocess.STDOUT, 20 | text=True, 21 | ) 22 | if process.returncode == 0: 23 | print(process.stdout, end="") 24 | else: 25 | raise ProbeScraperServerError( 26 | f"Command {args!r} returned non-zero exit status {process.returncode}: " 27 | + process.stdout 28 | ) 29 | 30 | 31 | def _s3_sync( 32 | src: Union[str, Path], 33 | dst: Union[str, Path], 34 | delete: bool = False, 35 | exclude: Tuple[str, ...] = (), 36 | acl: Optional[str] = None, 37 | content_type: Optional[str] = None, 38 | content_encoding: Optional[str] = None, 39 | cache_control: Optional[str] = None, 40 | ): 41 | # must use sync for dirs and cp for files 42 | if isinstance(src, Path) and src.is_file(): 43 | # must upload files with cp 44 | s3_cmd = "cp" 45 | else: 46 | s3_cmd = "sync" 47 | 48 | _call( 49 | ["aws", "s3", s3_cmd, "--only-show-errors", str(src), str(dst)] 50 | + (["--delete"] if delete else []) 51 | + [ 52 | arg 53 | for key, value in zip( 54 | ( 55 | *("--exclude" for _ in exclude), 56 | "--content-type", 57 | "--content-encoding", 58 | "--cache-control", 59 | "--acl", 60 | ), 61 | ( 62 | *exclude, 63 | content_type, 64 | content_encoding, 65 | cache_control, 66 | acl, 67 | ), 68 | ) 69 | if value is not None 70 | for arg in (key, value) 71 | ] 72 | ) 73 | 74 | 75 | def _gcs_sync( 76 | src: Union[str, Path], 77 | dst: Union[str, Path], 78 | delete: bool = False, 79 | exclude: Tuple[str, ...] 
= (), 80 | content_type: Optional[str] = None, 81 | content_encoding: Optional[str] = None, 82 | cache_control: Optional[str] = None, 83 | acl: Optional[str] = None, 84 | ): 85 | if isinstance(src, Path) and src.is_file(): 86 | # must upload files with cp 87 | gsutil_cmd = ["cp"] 88 | if delete: 89 | raise ValueError("cannot delete when uploading a single file") 90 | if exclude: 91 | raise ValueError("cannot exclude when uploading a single file") 92 | else: 93 | gsutil_cmd = ["rsync", "-r"] 94 | 95 | _call( 96 | ["gsutil", "-q", "-m"] 97 | # -h flags are global and must appear before the rsync/cp command 98 | + [ 99 | arg 100 | for header, value in zip( 101 | ["Content-Type", "Content-Encoding", "Cache-Control"], 102 | [content_type, content_encoding, cache_control], 103 | ) 104 | if value is not None 105 | for arg in ("-h", f"{header}:{value}") 106 | ] 107 | + gsutil_cmd 108 | # command specific options must appear before src and dst 109 | + (["-d"] if delete else []) 110 | # translate excludes from glob to regex before passing to gsutil 111 | + [arg for item in exclude for arg in ("-x", fnmatch.translate(item))] 112 | + (["-a", acl] if acl is not None else []) 113 | + [str(src), str(dst)] 114 | ) 115 | 116 | 117 | def _get_sync_function(remote: str): 118 | if remote.startswith("s3://"): 119 | return _s3_sync 120 | elif remote.startswith("gs://"): 121 | return _gcs_sync 122 | else: 123 | raise ValueError( 124 | f"remote path must have scheme like s3:// or gs://, got: {remote!r}" 125 | ) 126 | 127 | 128 | def remote_storage_pull(src: str, dst: Path, decompress: bool = False): 129 | sync = _get_sync_function(src) 130 | if sync is _gcs_sync: 131 | # gsutil will decompress files 132 | decompress = False 133 | # prevent error from gsutil when dst and src do not exist 134 | dst.mkdir(parents=True, exist_ok=True) 135 | 136 | if decompress: 137 | with TemporaryDirectory() as tmp: 138 | tmp_path = Path(tmp) 139 | sync(src, tmp_path) 140 | for in_file in tmp_path.rglob("*"): 141 | if not in_file.is_dir(): 142 | out_file = dst / in_file.relative_to(tmp_path) 143 | out_file.parent.mkdir(parents=True, exist_ok=True) 144 | out_file.write_bytes(gzip.decompress(in_file.read_bytes())) 145 | else: 146 | sync(src, dst) 147 | 148 | 149 | def remote_storage_push(src: Path, dst: str, compress: bool = False, **kwargs): 150 | sync = _get_sync_function(dst) 151 | if compress: 152 | kwargs["content_encoding"] = "gzip" 153 | if "exclude" in kwargs: 154 | raise NotImplementedError("exclude is not supported while compressing") 155 | # cloudfront is supposed to automatically gzip objects, but it won't do that 156 | # if the object size is > 10 megabytes (https://webmasters.stackexchange.com/a/111734) 157 | # which our files sometimes are. 
to work around this, as well as to support google 158 | # cloud storage, we'll gzip the contents into a temporary directory, and upload that 159 | # with a special content encoding 160 | with TemporaryDirectory() as tmp_name: 161 | tmp = Path(tmp_name) 162 | if src.is_dir(): 163 | for in_file in src.rglob("*"): 164 | if not in_file.is_dir(): 165 | out_file = tmp / in_file.relative_to(src) 166 | out_file.parent.mkdir(parents=True, exist_ok=True) 167 | out_file.write_bytes(gzip.compress(in_file.read_bytes())) 168 | index = tmp / INDEX_HTML 169 | if index.exists(): 170 | # must be a tuple 171 | kwargs["exclude"] = (INDEX_HTML,) 172 | sync( 173 | src=tmp, 174 | dst=dst, 175 | content_type=APPLICATION_JSON, 176 | **kwargs, 177 | ) 178 | if index.exists(): 179 | # cannot delete or exclude with a single file 180 | kwargs["delete"] = False 181 | kwargs["exclude"] = () 182 | sync( 183 | src=index, 184 | dst=dst, 185 | content_type=TEXT_HTML, 186 | **kwargs, 187 | ) 188 | else: 189 | tmp_file = tmp / src.name 190 | tmp_file.write_bytes(gzip.compress(src.read_bytes())) 191 | content_type = TEXT_HTML if src.name == INDEX_HTML else APPLICATION_JSON 192 | sync( 193 | src=tmp_file, 194 | dst=dst, 195 | content_type=content_type, 196 | **kwargs, 197 | ) 198 | else: 199 | sync(src, dst, **kwargs) 200 | -------------------------------------------------------------------------------- /probe_scraper/scrapers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/probe-scraper/0506e31f38e61ddc662c0eab18826b370314896e/probe_scraper/scrapers/__init__.py -------------------------------------------------------------------------------- /probe_scraper/scrapers/buildhub.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import re 3 | from datetime import datetime 4 | 5 | import requests 6 | 7 | 8 | class NoDataFoundException(Exception): 9 | pass 10 | 11 | 12 | class Buildhub(object): 13 | 14 | search_url = "https://buildhub.moz.tools/api/search" 15 | default_window = 1000 16 | 17 | date_formats = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f") 18 | 19 | def _paginate_revision_dates( 20 | self, 21 | iteration, 22 | channel, 23 | min_version, 24 | product, 25 | locale, 26 | platform, 27 | max_version, 28 | verbose, 29 | window, 30 | ): 31 | query_str = [ 32 | {"term": {"source.product": product}}, 33 | {"term": {"target.channel": channel}}, 34 | {"term": {"target.locale": locale}}, 35 | {"term": {"target.platform": platform}}, 36 | ] 37 | 38 | # See: "99" > "65" == True, "100" > "65" == False 39 | # FIXME: This breaks if we get to v200 40 | # If we only need versions above 99 we restrict it to versions below 200, 41 | # then we're good for a bunch of versions. 42 | if min_version >= 100: 43 | query_str.append({"range": {"target.version": {"gte": str(min_version)}}}) 44 | if max_version is None: 45 | # This works because the minimum we ever ask for is v30. 46 | query_str.append({"range": {"target.version": {"lt": "200"}}}) 47 | else: 48 | # If the user didn't set a max version we need to explicitly include v100..v200 here. 
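            # Concretely, the pitfall is lexicographic string ordering in the
            # Elasticsearch range filter (values below are illustrative):
            #
            #     "99" > "65"    # True  -- happens to look numeric
            #     "100" > "65"   # False -- "1" sorts before "6"
            #
            # so a plain gte filter on a two-digit min_version would silently
            # drop v100+ builds; the extra "should" clause below adds the
            # 100 <= version < 200 band back in.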
49 | if max_version is None: 50 | query_str.append( 51 | { 52 | "bool": { 53 | "should": [ 54 | { 55 | "range": { 56 | "target.version": {"gte": str(min_version)} 57 | } 58 | }, 59 | { 60 | "bool": { 61 | "must": [ 62 | { 63 | "range": { 64 | "target.version": {"gte": "100"} 65 | } 66 | }, 67 | { 68 | "range": { 69 | "target.version": {"lt": "200"} 70 | } 71 | }, 72 | ] 73 | } 74 | }, 75 | ] 76 | } 77 | } 78 | ) 79 | else: 80 | # Otherwise we only check the min version, 81 | # the max version check will be appended 82 | query_str.append( 83 | {"range": {"target.version": {"gte": str(min_version)}}} 84 | ) 85 | 86 | if max_version is not None: 87 | query_str.append( 88 | { 89 | "bool": { 90 | "should": [ 91 | {"range": {"target.version": {"lte": str(max_version)}}}, 92 | {"prefix": {"target.version": str(max_version)}}, 93 | ] 94 | } 95 | } 96 | ) 97 | 98 | body = {"query": {"bool": {"filter": query_str}}, "size": window} 99 | 100 | if iteration != 0: 101 | body["from"] = iteration * window 102 | 103 | if verbose: 104 | print("------QUERY STRING------\n") 105 | pprint.pprint(body) 106 | 107 | response = requests.post(url=Buildhub.search_url, json=body) 108 | data = response.json() 109 | 110 | if verbose: 111 | print("------QUERY RESULTS------\n") 112 | pprint.pprint(data) 113 | 114 | return data 115 | 116 | def _distinct_and_clean(self, records): 117 | """ 118 | For more information on the schema of the records, 119 | see the Buildhub API documentation: 120 | https://buildhub.readthedocs.io/en/latest/api.html#more-about-the-data-schema 121 | """ 122 | cleaned_records = {} 123 | 124 | for record in records: 125 | # %:z not supported, see https://bugs.python.org/msg169952 126 | # Drop the tz portion entirely 127 | d = record["_source"]["download"]["date"] 128 | if re.search(r"\+\d{2}:\d{2}$", d): 129 | d = d[:-6] 130 | 131 | date = None 132 | try: 133 | date = datetime.strptime(d, self.date_formats[0]) 134 | except ValueError: 135 | pass 136 | 137 | if date is None: 138 | date = datetime.strptime(d, self.date_formats[1]) 139 | 140 | entry = { 141 | "date": date, 142 | "revision": record["_source"]["source"]["revision"], 143 | "version": record["_source"]["target"]["version"], 144 | "tree": record["_source"]["source"]["tree"], 145 | } 146 | 147 | revision = entry["revision"] 148 | min_entry = entry 149 | 150 | if revision in cleaned_records: 151 | if cleaned_records[revision] != entry: 152 | min_entry = min( 153 | (entry, cleaned_records[revision]), key=lambda x: x["date"] 154 | ) 155 | 156 | cleaned_records[revision] = min_entry 157 | 158 | return sorted(cleaned_records.values(), key=lambda x: x["date"]) 159 | 160 | def get_revision_dates( 161 | self, 162 | channel, 163 | min_version, 164 | product="firefox", 165 | locale="en-US", 166 | platform="win64", 167 | max_version=None, 168 | verbose=False, 169 | window=500, 170 | ): 171 | """ 172 | Retrieve the revisions and publish-dates for a given filter set. 173 | The combination of channel, product, local, and platform almost 174 | gives a set of unique (revision, publication-dates). For example, 175 | `win64` includes x86 and arm-64 builds. As such we de-duplicate 176 | the result set and include the build with the earliest publication 177 | date. 
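        Example (illustrative; the "release"/62 values mirror this repo's
        tests, and the record shape is documented below):

            bh = Buildhub()
            releases = bh.get_revision_dates("release", 62, max_version=62)
            releases[0]["revision"]  # -> a mozilla-release changeset hash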
178 | 179 | Tree is the source tree, usually one of: 180 | - mozilla-central 181 | - mozilla-beta 182 | - mozilla-release 183 | 184 | :param channel: The release channel 185 | :param min_version: The minimum version to include 186 | :param product: Defaults to firefox 187 | :param locale: Defaults to en-US 188 | :param platform: Defaults to win64 189 | :param max_version: Optional maximum version to include 190 | :param verbose: Verbose output of query string and results 191 | :param window: Number of records to retrieve at a time 192 | 193 | returns a list of records of type 194 | { 195 | "date": 196 | "revision": , 197 | "version": , 198 | "tree": 199 | } 200 | """ 201 | 202 | # Because "100" > "99" == False we special-case v100 to v199. 203 | # v200 is far out, so we just ignore that for now. 204 | assert min_version < 200, "Only versions below 200 are supported" 205 | 206 | total_hits = 0 207 | results = [] 208 | 209 | for i in range(2**20): 210 | data = self._paginate_revision_dates( 211 | i, 212 | channel, 213 | min_version, 214 | product, 215 | locale, 216 | platform, 217 | max_version, 218 | verbose, 219 | window, 220 | ) 221 | 222 | # hits/total gives total number of records, including 223 | # those outside the window. We need to know the number 224 | # inside the window. 225 | hits = len(data["hits"]["hits"]) 226 | 227 | if hits: 228 | total_hits += hits 229 | results.append(data) 230 | 231 | # optimization, removes the last no-result window 232 | if hits < window: 233 | break 234 | 235 | if total_hits == 0: 236 | raise NoDataFoundException( 237 | "No data found for channel {} and minimum \ 238 | version {}".format( 239 | channel, min_version 240 | ) 241 | ) 242 | 243 | all_records = [ 244 | record for result in results for record in result["hits"]["hits"] 245 | ] 246 | return self._distinct_and_clean(all_records) 247 | -------------------------------------------------------------------------------- /probe_scraper/scrapers/moz_central_scraper.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
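# Sketch of how a registry file is fetched in download_files() below, with
# assumed example values (the node hash is the artificial-tag one from
# CHANNELS; any real changeset id works the same way):
#
#     channel = "nightly"
#     node = "fd2934cca1ae7b492f29a4d240915aa9ec5b4977"
#     rel_path = "toolkit/components/telemetry/Histograms.json"
#     uri = f"{CHANNELS[channel]['base_uri']}/raw-file/{node}/{rel_path}"
#     # -> https://hg.mozilla.org/mozilla-central/raw-file/fd2934cc.../...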
4 | 5 | import json 6 | import os 7 | import re 8 | from collections import defaultdict 9 | 10 | import requests 11 | 12 | from ..parsers.utils import HTTP_HEADERS 13 | from .buildhub import Buildhub 14 | 15 | BASE_URI = "https://hg.mozilla.org" 16 | 17 | REGISTRY_FILES = { 18 | "histogram": [ 19 | "toolkit/components/telemetry/Histograms.json", 20 | "dom/base/UseCounters.conf", 21 | "dom/base/nsDeprecatedOperationList.h", 22 | "servo/components/style/properties/counted_unknown_properties.py", 23 | "devtools/shared/css/generated/properties-db.js", 24 | ], 25 | "scalar": [ 26 | "toolkit/components/telemetry/Scalars.yaml", 27 | ], 28 | "event": [ 29 | "toolkit/components/telemetry/Events.yaml", 30 | ], 31 | } 32 | 33 | 34 | CHANNELS = { 35 | "nightly": { 36 | "base_uri": f"{BASE_URI}/mozilla-central", 37 | "tag_regex": "^FIREFOX_(AURORA|BETA)_[0-9]+_BASE$", 38 | "artificial_tags": [ 39 | { 40 | "date": [1567362726.0, 0], 41 | "node": "fd2934cca1ae7b492f29a4d240915aa9ec5b4977", 42 | "tag": "FIREFOX_BETA_71_BASE", 43 | } 44 | ], 45 | }, 46 | "beta": { 47 | "base_uri": f"{BASE_URI}/releases/mozilla-beta", 48 | "tag_regex": "^FIREFOX_BETA_[0-9]+_BASE$", 49 | }, 50 | "release": { 51 | "base_uri": f"{BASE_URI}/releases/mozilla-release", 52 | "tag_regex": "^FIREFOX_[0-9]+_0_RELEASE$", 53 | }, 54 | } 55 | 56 | SKIP_REVISIONS = { 57 | "942c201b1ac7a46a449f1fb80da7b050ec0ea120", 58 | "1807a36ff99f01abca1c37442fb5b344465bfbdf", 59 | "30bdee9799a07b8770719aa868416174ff0c54f5", 60 | "9fb70b4ae59336b805a1651e7c57c6385cca0717", 61 | "81578db6bf8939678d490b69f0daf4b675027e3a", 62 | "b8567457ece9593ddb00344130597698145bdc5c", 63 | "c4bdea458a08b975ffd70faed4a2f6fbe1e563bc", 64 | "d420f9190e2f35e314aa67ee346650f86451792c", 65 | "a680e8cd9618f4afbbb148ad464824cd6ce558d9", 66 | "5cbd3d92a78c54b324b6009a25d196adaa8a669b", 67 | "75c1403f58f79d1abd43d33fdd1beb36db9367c6", 68 | "cafaf813b0a938a197a488e629883770b2d33393", 69 | "cbbf6a7e34a363b39107b60dddac2aa713eaa8b5", 70 | } 71 | 72 | MIN_FIREFOX_VERSION = 30 73 | ERROR_CACHE_FILENAME = "probe_scraper_errors_cache.json" 74 | ARTIFICIAL_TAG = "artificial" 75 | 76 | 77 | def extract_major_version(version_str): 78 | """ 79 | Given a version string, e.g. "62.0a1", 80 | extract the major version as an int. 81 | """ 82 | search = re.search(r"^(\d+)\.", version_str) 83 | if search is not None: 84 | return int(search.group(1)) 85 | else: 86 | raise Exception("Invalid version string " + version_str) 87 | 88 | 89 | def relative_path_is_in_version(rel_path, version): 90 | # The devtools file exists in a bunch of versions, but we only care for it 91 | # since firefox 71 (bug 1578661). 
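    # For example:
    #     relative_path_is_in_version(
    #         "devtools/shared/css/generated/properties-db.js", 70)   # False
    #     relative_path_is_in_version(
    #         "devtools/shared/css/generated/properties-db.js", 71)   # True
    #     relative_path_is_in_version(
    #         "toolkit/components/telemetry/Histograms.json", 30)     # True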
92 | if ( 93 | rel_path == "devtools/shared/css/generated/properties-db.js" 94 | or rel_path == "servo/components/style/properties/counted_unknown_properties.py" 95 | ): 96 | return version >= 71 97 | return True 98 | 99 | 100 | def download_files(channel, node, temp_dir, error_cache, version, tree=None): 101 | if tree is None: 102 | uri = CHANNELS[channel]["base_uri"] 103 | else: 104 | # mozilla-release and mozilla-beta need to be prefixed with "release/" 105 | # sometimes they aren't from buildhub, add them if they are missing 106 | if not tree.startswith("releases/") and tree != "mozilla-central": 107 | tree = f"releases/{tree}" 108 | uri = f"{BASE_URI}/{tree}" 109 | 110 | base_uri = f"{uri}/raw-file/{node}/" 111 | node_path = os.path.join(temp_dir, "hg", node) 112 | 113 | results = {} 114 | 115 | def add_result(ptype, disk_path): 116 | if ptype not in results: 117 | results[ptype] = [] 118 | results[ptype].append(disk_path) 119 | 120 | all_files = [(k, x) for k, l in list(REGISTRY_FILES.items()) for x in l] 121 | for ptype, rel_path in all_files: 122 | disk_path = os.path.join(node_path, rel_path) 123 | if os.path.exists(disk_path): 124 | add_result(ptype, disk_path) 125 | continue 126 | 127 | uri = base_uri + rel_path 128 | # requests_cache doesn't cache on error status codes. 129 | # We just use our own cache for these for now. 130 | if uri in error_cache: 131 | continue 132 | 133 | if not relative_path_is_in_version(rel_path, int(version)): 134 | continue 135 | 136 | req = requests.get(uri, headers=HTTP_HEADERS) 137 | if req.status_code != requests.codes.ok: 138 | if os.path.basename(rel_path) == "Histograms.json": 139 | raise Exception( 140 | "Request returned status " + str(req.status_code) + " for " + uri 141 | ) 142 | else: 143 | error_cache[uri] = req.status_code 144 | continue 145 | 146 | dir = os.path.split(disk_path)[0] 147 | if not os.path.exists(dir): 148 | os.makedirs(dir) 149 | with open(disk_path, "wb") as f: 150 | for chunk in req.iter_content(chunk_size=128): 151 | f.write(chunk) 152 | 153 | add_result(ptype, disk_path) 154 | 155 | return results 156 | 157 | 158 | def load_error_cache(folder): 159 | path = os.path.join(folder, ERROR_CACHE_FILENAME) 160 | if not os.path.exists(path): 161 | return {} 162 | with open(path, "r") as f: 163 | return json.load(f) 164 | 165 | 166 | def save_error_cache(folder, error_cache): 167 | path = os.path.join(folder, ERROR_CACHE_FILENAME) 168 | with open(path, "w") as f: 169 | json.dump(error_cache, f, sort_keys=True, indent=2, separators=(",", ": ")) 170 | 171 | 172 | def scrape_channel_revisions( 173 | folder=None, min_fx_version=None, max_fx_version=None, channels=None 174 | ): 175 | """ 176 | Returns data in the format: 177 | { 178 | : { 179 | : { 180 | "date": , 181 | "version": , 182 | "registries": { 183 | "histogram": [path, ...], 184 | "event": [path, ...], 185 | "scalar": [path, ...] 186 | } 187 | } 188 | }, 189 | ... 
190 | } 191 | """ 192 | if min_fx_version is None: 193 | min_fx_version = MIN_FIREFOX_VERSION 194 | 195 | error_cache = load_error_cache(folder) 196 | bh = Buildhub() 197 | results = defaultdict(dict) 198 | 199 | if channels is None: 200 | channels = CHANNELS.keys() 201 | 202 | for channel in channels: 203 | print("\nRetreiving Buildhub results for channel " + channel) 204 | 205 | revision_dates = [ 206 | rd 207 | for rd in bh.get_revision_dates( 208 | channel, min_fx_version, max_version=max_fx_version 209 | ) 210 | if rd["revision"] not in SKIP_REVISIONS 211 | ] 212 | num_revisions = len(revision_dates) 213 | 214 | print(" " + str(num_revisions) + " revisions found") 215 | 216 | for i, rd in enumerate(revision_dates): 217 | revision = rd["revision"] 218 | 219 | print( 220 | ( 221 | f" Downloading files for revision number {str(i+1)}/{str(num_revisions)}" 222 | f" - revision: {revision}, tree: {rd['tree']}, version: {str(rd['version'])}" 223 | ) 224 | ) 225 | version = extract_major_version(rd["version"]) 226 | files = download_files( 227 | channel, revision, folder, error_cache, version, tree=rd["tree"] 228 | ) 229 | 230 | results[channel][revision] = { 231 | "date": rd["date"], 232 | "version": version, 233 | "registries": files, 234 | } 235 | save_error_cache(folder, error_cache) 236 | 237 | return results 238 | -------------------------------------------------------------------------------- /probe_scraper/transform_revisions.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from collections import defaultdict 6 | 7 | 8 | def transform(node_data): 9 | results = defaultdict(dict) 10 | for channel, nodes in node_data.items(): 11 | for node_id, details in nodes.items(): 12 | results[channel][node_id] = { 13 | "version": details.get("version"), 14 | "date": details.get("date"), 15 | } 16 | 17 | return results 18 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | web_dependency: mark a test that requires a web connection. 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | awscli==1.29.7 2 | beautifulsoup4==4.8.2 3 | GitPython==3.1.41 4 | boto3==1.28.7 5 | Flask==2.3.3 6 | glean-parser~=17.1.0 7 | google-cloud-bigquery==3.23.1 8 | google-cloud-storage==2.2.1 9 | gsutil==5.28 10 | Jinja2==3.1.6 11 | jsonschema==3.1.1 12 | python-dateutil==2.8.0 13 | PyYAML==6.0.1 14 | requests==2.32.0 15 | requests_cache==0.5.2 16 | requests_file==1.4.3 17 | schema==0.7.1 18 | urllib3==1.26.19 19 | Werkzeug==2.3.8 20 | yamllint 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="probe-scraper", 7 | version="0.1", 8 | description="Scrape metric data from Mozilla products repositories.", 9 | author="Mozilla", 10 | # While this is not owned by the Glean team, I could not find a better 11 | # email address for this. 
12 | author_email="glean-team@mozilla.com", 13 | classifiers=[ 14 | "Intended Audience :: Developers", 15 | "Natural Language :: English", 16 | "Programming Language :: Python :: 3", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | ], 21 | url="https://github.com/mozilla/probe-scraper/", 22 | packages=["probe_scraper"], 23 | license="MPL 2.0", 24 | ) 25 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==7.0.0 2 | pytest>=3.0 3 | black==24.4.2 4 | isort==5.13.2 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/probe-scraper/0506e31f38e61ddc662c0eab18826b370314896e/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/Histograms.json: -------------------------------------------------------------------------------- 1 | { 2 | "TELEMETRY_TEST_FLAG": { 3 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 4 | "expires_in_version": "never", 5 | "kind": "flag", 6 | "description": "a testing histogram; not meant to be touched" 7 | }, 8 | "TELEMETRY_TEST_COUNT": { 9 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 10 | "expires_in_version": "never", 11 | "kind": "count", 12 | "description": "a testing histogram; not meant to be touched" 13 | }, 14 | "TELEMETRY_TEST_COUNT2": { 15 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 16 | "expires_in_version": "never", 17 | "kind": "count", 18 | "bug_numbers": [1288745], 19 | "description": "a testing histogram; not meant to be touched" 20 | }, 21 | "TELEMETRY_TEST_COUNT_INIT_NO_RECORD": { 22 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 23 | "expires_in_version": "never", 24 | "kind": "count", 25 | "description": "a testing histogram; not meant to be touched - initially not recording" 26 | }, 27 | "TELEMETRY_TEST_CATEGORICAL": { 28 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 29 | "bug_numbers": [1188888], 30 | "expires_in_version": "never", 31 | "kind": "categorical", 32 | "labels": [ 33 | "CommonLabel", 34 | "Label2", 35 | "Label3" 36 | ], 37 | "description": "a testing histogram; not meant to be touched" 38 | }, 39 | "TELEMETRY_TEST_CATEGORICAL_OPTOUT": { 40 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 41 | "bug_numbers": [1188888], 42 | "expires_in_version": "never", 43 | "releaseChannelCollection": "opt-out", 44 | "kind": "categorical", 45 | "labels": [ 46 | "CommonLabel", 47 | "Label4", 48 | "Label5", 49 | "Label6" 50 | ], 51 | "description": "a testing histogram; not meant to be touched" 52 | }, 53 | "TELEMETRY_TEST_CATEGORICAL_NVALUES": { 54 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 55 | "bug_numbers": [1188888], 56 | "expires_in_version": "never", 57 | "kind": "categorical", 58 | "n_values": 70, 59 | "labels": [ 60 | "CommonLabel", 61 | "Label7", 62 | "Label8" 63 | ], 64 | "description": "a testing histogram; not meant to be touched" 65 | }, 66 | "TELEMETRY_TEST_CATEGORICAL_EMPTY_LABELS": { 67 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 68 | "bug_numbers": [1188888], 69 | "expires_in_version": "never", 70 | "kind": "categorical", 71 | "labels": [ 72 | ], 73 | "description": "a testing histogram; 
not meant to be touched" 74 | }, 75 | "TELEMETRY_TEST_KEYED_COUNT_INIT_NO_RECORD": { 76 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 77 | "expires_in_version": "never", 78 | "kind": "count", 79 | "keyed": true, 80 | "description": "a testing histogram; not meant to be touched - initially not recording" 81 | }, 82 | "TELEMETRY_TEST_KEYED_FLAG": { 83 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 84 | "expires_in_version": "never", 85 | "kind": "flag", 86 | "keyed": true, 87 | "description": "a testing histogram; not meant to be touched" 88 | }, 89 | "TELEMETRY_TEST_KEYED_COUNT": { 90 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 91 | "expires_in_version": "never", 92 | "kind": "count", 93 | "keyed": true, 94 | "description": "a testing histogram; not meant to be touched" 95 | }, 96 | "TELEMETRY_TEST_KEYED_BOOLEAN": { 97 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 98 | "expires_in_version": "never", 99 | "kind": "boolean", 100 | "keyed": true, 101 | "bug_numbers": [1299144], 102 | "description": "a testing histogram; not meant to be touched" 103 | }, 104 | "TELEMETRY_TEST_RELEASE_OPTOUT": { 105 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 106 | "expires_in_version": "never", 107 | "kind": "flag", 108 | "releaseChannelCollection": "opt-out", 109 | "description": "a testing histogram; not meant to be touched" 110 | }, 111 | "TELEMETRY_TEST_RELEASE_OPTIN": { 112 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 113 | "expires_in_version": "never", 114 | "kind": "flag", 115 | "releaseChannelCollection": "opt-in", 116 | "description": "a testing histogram; not meant to be touched" 117 | }, 118 | "TELEMETRY_TEST_KEYED_RELEASE_OPTIN": { 119 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 120 | "expires_in_version": "never", 121 | "kind": "flag", 122 | "keyed": true, 123 | "releaseChannelCollection": "opt-in", 124 | "description": "a testing histogram; not meant to be touched" 125 | }, 126 | "TELEMETRY_TEST_KEYED_RELEASE_OPTOUT": { 127 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 128 | "expires_in_version": "never", 129 | "kind": "flag", 130 | "keyed": true, 131 | "releaseChannelCollection": "opt-out", 132 | "description": "a testing histogram; not meant to be touched" 133 | }, 134 | "TELEMETRY_TEST_EXPONENTIAL": { 135 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 136 | "expires_in_version": "never", 137 | "kind": "exponential", 138 | "low": 1, 139 | "high": 2147483646, 140 | "n_buckets": 10, 141 | "bug_numbers": [1288745], 142 | "description": "a testing histogram; not meant to be touched" 143 | }, 144 | "TELEMETRY_TEST_LINEAR": { 145 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 146 | "expires_in_version": "never", 147 | "kind": "linear", 148 | "low": 1, 149 | "high": 2147483646, 150 | "n_buckets": 10, 151 | "bug_numbers": [1288745], 152 | "description": "a testing histogram; not meant to be touched" 153 | }, 154 | "TELEMETRY_TEST_BOOLEAN": { 155 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 156 | "expires_in_version" : "never", 157 | "kind": "boolean", 158 | "bug_numbers": [1288745], 159 | "description": "a testing histogram; not meant to be touched" 160 | }, 161 | "TELEMETRY_TEST_EXPIRED": { 162 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 163 | "expires_in_version": "4.0a1", 164 | "kind": "flag", 165 | "description": "a testing histogram; not meant to be touched" 166 | }, 167 | "TELEMETRY_TEST_ALL_CHILDREN": { 168 | "record_in_processes": ["all_children"], 169 | "alert_emails": 
["telemetry-client-dev@mozilla.com"], 170 | "expires_in_version": "never", 171 | "kind": "linear", 172 | "low": 1, 173 | "high": 10000, 174 | "n_buckets": 10, 175 | "bug_numbers": [1363725], 176 | "description": "a testing histogram; not meant to be touched" 177 | }, 178 | "TELEMETRY_TEST_ALL_CHILDS": { 179 | "record_in_processes": ["all_childs"], 180 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 181 | "expires_in_version": "never", 182 | "kind": "linear", 183 | "low": 1, 184 | "high": 10000, 185 | "n_buckets": 10, 186 | "bug_numbers": [1335343,1363725], 187 | "description": "a testing histogram; not meant to be touched" 188 | }, 189 | "EXPRESSION_IN_LOW_HIGH_ATTRIBUTE": { 190 | "expires_in_version": "never", 191 | "kind": "exponential", 192 | "low": "32 * 1024", 193 | "high": "16 * 1024 * 1024", 194 | "n_buckets": 200, 195 | "extended_statistics_ok": true, 196 | "description": "Test Case for expression in low/high attribute" 197 | }, 198 | "NON_INTEGER_IN_HIGH_ATTRIBUTE": { 199 | "expires_in_version": "never", 200 | "kind": "exponential", 201 | "description": "Test Case for non-integer in high attribute", 202 | "high": "5000", 203 | "n_buckets": 10, 204 | "extended_statistics_ok": true 205 | }, 206 | "HISTOGRAM_WITH_MULTISTORE": { 207 | "alert_emails": ["telemetry-client-dev@mozilla.com"], 208 | "expires_in_version": "never", 209 | "kind": "linear", 210 | "low": 1, 211 | "high": 10000, 212 | "n_buckets": 10, 213 | "bug_numbers": [1335343,1363725], 214 | "description": "a testing histogram; not meant to be touched", 215 | "record_into_store": ["main", "store2"] 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /tests/resources/UseCounters.conf: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | // This file defines a list of use counters, which are things that can 6 | // record usage of Web platform features and then report this information 7 | // through Telemetry. 8 | // 9 | // The format of this file is very strict. Each line can be: 10 | // 11 | // (a) a blank line 12 | // 13 | // (b) a comment, which is a line that begins with "//" 14 | // 15 | // (c) one of three possible use counter declarations: 16 | // 17 | // method . 18 | // attribute . 19 | // property 20 | // 21 | // The |CSS property method name| should be identical to the |method| 22 | // argument to CSS_PROP and related macros. The method name is 23 | // identical to the name of the property, except that all hyphens are 24 | // removed and CamelCase naming is used. See nsCSSPropList.h for 25 | // further details. 26 | // 27 | // To actually cause use counters to be incremented, DOM methods 28 | // and attributes must have a [UseCounter] extended attribute in 29 | // the Web IDL file. CSS properties require no special treatment 30 | // beyond being listed below. 31 | // 32 | // You might reasonably ask why we have this file and we require 33 | // annotating things with [UseCounter] in the relevant WebIDL file as 34 | // well. Generating things from bindings codegen and ensuring all the 35 | // dependencies were correct would have been rather difficult, and 36 | // annotating the WebIDL files does nothing for identifying CSS 37 | // property usage, which we would also like to track. 
38 | 39 | method SVGSVGElement.getElementById 40 | attribute SVGSVGElement.currentScale 41 | property Fill 42 | -------------------------------------------------------------------------------- /tests/resources/metrics.yaml: -------------------------------------------------------------------------------- 1 | $schema: moz://mozilla.org/schemas/glean/metrics/1-0-0 2 | 3 | example: 4 | duration: 5 | type: timespan 6 | description: | 7 | The duration of the last foreground session. 8 | time_unit: second 9 | send_in_pings: 10 | - baseline 11 | bugs: 12 | - 1497894, 1519120 13 | data_reviews: 14 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 15 | notification_emails: 16 | - telemetry-client-dev@mozilla.com 17 | expires: '2015-07-11' 18 | 19 | os: 20 | type: string 21 | lifetime: application 22 | send_in_pings: 23 | - baseline 24 | - session_end 25 | description: | 26 | The name of the operating system. 27 | bugs: 28 | - 1497894 29 | data_reviews: 30 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 31 | notification_emails: 32 | - telemetry-client-dev@mozilla.com 33 | expires: never 34 | -------------------------------------------------------------------------------- /tests/resources/nsDeprecatedOperationList.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | // IWYU pragma: private, include "nsIDocument.h" 7 | 8 | /* 9 | * This file contains the list of deprecated DOM operations. It is 10 | * designed to be used as input to the C preprocessor *only*. 11 | */ 12 | 13 | DEPRECATED_OPERATION(GetAttributeNode) 14 | DEPRECATED_OPERATION(SetAttributeNode) 15 | -------------------------------------------------------------------------------- /tests/resources/test_events.yaml: -------------------------------------------------------------------------------- 1 | # This category contains event entries used for Telemetry tests. 2 | # They will not be sent out with any pings. 3 | telemetry.test: 4 | test: 5 | methods: ["test1", "test2"] 6 | objects: ["object1", "object2"] 7 | bug_numbers: [1286606] 8 | notification_emails: ["telemetry-client-dev@mozilla.com"] 9 | record_in_processes: ['main', 'content'] 10 | description: This is a test entry for Telemetry. 11 | expiry_date: never 12 | extra_keys: 13 | key1: This is just a test description. 14 | key2: This is another test description. 15 | optout: 16 | objects: ["object1", "object2"] 17 | bug_numbers: [1286606] 18 | notification_emails: ["telemetry-client-dev@mozilla.com"] 19 | description: This is an opt-out test entry. 20 | expiry_date: never 21 | release_channel_collection: opt-out 22 | extra_keys: 23 | key1: This is just a test description. 24 | expired_version: 25 | objects: ["object1", "object2"] 26 | bug_numbers: [1286606] 27 | notification_emails: ["telemetry-client-dev@mozilla.com"] 28 | description: This is a test entry with an expired version. 29 | expiry_version: "3.6" 30 | too_long_of_an_event_name: 31 | objects: ["object1", "object2"] 32 | bug_numbers: [1286606] 33 | notification_emails: ["telemetry-client-dev@mozilla.com"] 34 | description: This is an opt-out test entry. 
35 | expiry_date: never 36 | release_channel_collection: opt-out 37 | extra_keys: 38 | key1: This is just a test description. 39 | pause_behavior_change: This is a too-long extra key 40 | 41 | # This is a secondary category used for Telemetry tests. 42 | # The events here will not be sent out with any pings. 43 | telemetry.test.second: 44 | test: 45 | objects: ["object1", "object2", "object3"] 46 | bug_numbers: [1286606] 47 | notification_emails: ["telemetry-client-dev@mozilla.com"] 48 | description: This is a test entry for Telemetry. 49 | expiry_date: never 50 | extra_keys: 51 | key1: This is just a test description. 52 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/duplicate/0/metrics.yaml: -------------------------------------------------------------------------------- 1 | $schema: moz://mozilla.org/schemas/glean/metrics/1-0-0 2 | 3 | example: 4 | duration: 5 | type: counter 6 | description: | 7 | The duration of the last foreground session. 8 | send_in_pings: 9 | - baseline 10 | bugs: 11 | - 1497894, 1519120 12 | data_reviews: 13 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 14 | notification_emails: 15 | - alice@example.com 16 | expires: '2100-07-11' 17 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/expired/0/metrics.yaml: -------------------------------------------------------------------------------- 1 | $schema: moz://mozilla.org/schemas/glean/metrics/1-0-0 2 | 3 | example: 4 | duration: 5 | type: timespan 6 | description: | 7 | The duration of the last foreground session. 8 | time_unit: second 9 | send_in_pings: 10 | - baseline 11 | bugs: 12 | - 1497894, 1519120 13 | data_reviews: 14 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 15 | notification_emails: 16 | - bob@example.com 17 | expires: '2019-01-01' 18 | 19 | os: 20 | type: string 21 | lifetime: application 22 | send_in_pings: 23 | - baseline 24 | description: | 25 | Stop 26 | bugs: 27 | - 1497894 28 | data_reviews: 29 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 30 | notification_emails: 31 | - telemetry-client-dev@mozilla.com 32 | expires: never 33 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/improper/0/metrics.yaml: -------------------------------------------------------------------------------- 1 | example: 2 | duration: 3 | type: timespan 4 | time_unit: second 5 | send_in_pings: 6 | - baseline 7 | bugs: 8 | - 1497894, 1519120 9 | data_reviews: 10 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 11 | notification_emails: 12 | - telemetry-client-dev@mozilla.com 13 | expires: never 14 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/normal/0/metrics.yaml: -------------------------------------------------------------------------------- 1 | $schema: moz://mozilla.org/schemas/glean/metrics/1-0-0 2 | 3 | example: 4 | duration: 5 | type: timespan 6 | description: | 7 | The duration of the last foreground session. 
8 | time_unit: second 9 | send_in_pings: 10 | - baseline 11 | bugs: 12 | - 1497894, 1519120 13 | data_reviews: 14 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 15 | notification_emails: 16 | - bob@example.com 17 | expires: '2100-07-11' 18 | 19 | os: 20 | type: string 21 | lifetime: application 22 | send_in_pings: 23 | - baseline 24 | description: | 25 | Stop 26 | bugs: 27 | - 1497894 28 | data_reviews: 29 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 30 | notification_emails: 31 | - telemetry-client-dev@mozilla.com 32 | expires: never 33 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/normal/1/metrics.yaml: -------------------------------------------------------------------------------- 1 | $schema: moz://mozilla.org/schemas/glean/metrics/1-0-0 2 | 3 | example: 4 | duration: 5 | type: timespan 6 | description: | 7 | The duration of the last foreground session. 8 | time_unit: second 9 | send_in_pings: 10 | - baseline 11 | bugs: 12 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1497894 13 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1519120 14 | data_reviews: 15 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 16 | notification_emails: 17 | - charlie@example.com 18 | expires: '2100-07-11' 19 | 20 | os: 21 | type: string 22 | lifetime: application 23 | send_in_pings: 24 | - baseline 25 | description: | 26 | don't 27 | bugs: 28 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1497894 29 | data_reviews: 30 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 31 | notification_emails: 32 | - telemetry-client-dev@mozilla.com 33 | expires: never 34 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/normal/2/metrics.yaml: -------------------------------------------------------------------------------- 1 | $schema: moz://mozilla.org/schemas/glean/metrics/2-0-0 2 | 3 | example: 4 | duration: 5 | type: timespan 6 | description: | 7 | The duration of the last foreground session. 8 | time_unit: second 9 | send_in_pings: 10 | - baseline 11 | bugs: 12 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1497894 13 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1519120 14 | data_reviews: 15 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 16 | notification_emails: 17 | - charlie@example.com 18 | expires: '2100-07-11' 19 | 20 | os: 21 | type: string 22 | lifetime: application 23 | send_in_pings: 24 | - baseline 25 | description: | 26 | pop 27 | metadata: 28 | tags: 29 | - foo 30 | bugs: 31 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1497894 32 | data_reviews: 33 | - https://bugzilla.mozilla.org/show_bug.cgi?id=1512938#c3 34 | notification_emails: 35 | - telemetry-client-dev@mozilla.com 36 | expires: never 37 | 38 | -------------------------------------------------------------------------------- /tests/resources/test_repo_files/normal/2/tags.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: moz://mozilla.org/schemas/glean/tags/1-0-0 3 | 4 | foo: 5 | description: "the foo tag" 6 | -------------------------------------------------------------------------------- /tests/resources/test_scalars.yaml: -------------------------------------------------------------------------------- 1 | # This file contains a definition of the scalar probes that are recorded in Telemetry. 2 | # They are submitted with the "main" pings and can be inspected in about:telemetry. 
3 | 4 | # The following section is for probes testing the Telemetry system. They will not be 5 | # submitted in pings and are only used for testing. 6 | telemetry.test: 7 | unsigned_int_kind: 8 | bug_numbers: 9 | - 1276190 10 | description: > 11 | This is a test uint type with a really long description, maybe spanning even multiple 12 | lines, to just prove a point: everything works just fine. 13 | expires: never 14 | kind: uint 15 | notification_emails: 16 | - telemetry-client-dev@mozilla.com 17 | 18 | string_kind: 19 | bug_numbers: 20 | - 1276190 21 | description: A string test type with a one line comment that works just fine! 22 | expires: never 23 | kind: string 24 | notification_emails: 25 | - telemetry-client-dev@mozilla.com 26 | 27 | boolean_kind: 28 | bug_numbers: 29 | - 1281214 30 | description: A boolean test type with a one line comment that works just fine! 31 | expires: never 32 | kind: boolean 33 | notification_emails: 34 | - telemetry-client-dev@mozilla.com 35 | 36 | expired: 37 | bug_numbers: 38 | - 1276190 39 | description: This is an expired testing scalar; not meant to be touched. 40 | expires: 4.0a1 41 | kind: uint 42 | notification_emails: 43 | - telemetry-client-dev@mozilla.com 44 | 45 | unexpired: 46 | bug_numbers: 47 | - 1276190 48 | description: This is an unexpired testing scalar; not meant to be touched. 49 | expires: "375.0" 50 | kind: uint 51 | notification_emails: 52 | - telemetry-client-dev@mozilla.com 53 | 54 | release_optin: 55 | bug_numbers: 56 | - 1276190 57 | description: A testing scalar; not meant to be touched. 58 | expires: never 59 | kind: uint 60 | notification_emails: 61 | - telemetry-client-dev@mozilla.com 62 | release_channel_collection: opt-in 63 | 64 | release_optout: 65 | bug_numbers: 66 | - 1276190 67 | description: A testing scalar; not meant to be touched. 68 | expires: never 69 | kind: uint 70 | notification_emails: 71 | - telemetry-client-dev@mozilla.com 72 | release_channel_collection: opt-out 73 | 74 | keyed_release_optin: 75 | bug_numbers: 76 | - 1277806 77 | description: A testing scalar; not meant to be touched. 78 | expires: never 79 | kind: uint 80 | keyed: true 81 | notification_emails: 82 | - telemetry-client-dev@mozilla.com 83 | release_channel_collection: opt-in 84 | 85 | keyed_release_optout: 86 | bug_numbers: 87 | - 1277806 88 | description: A testing scalar; not meant to be touched. 89 | expires: never 90 | kind: uint 91 | keyed: true 92 | notification_emails: 93 | - telemetry-client-dev@mozilla.com 94 | release_channel_collection: opt-out 95 | 96 | keyed_expired: 97 | bug_numbers: 98 | - 1277806 99 | description: This is an expired testing scalar; not meant to be touched. 100 | expires: 4.0a1 101 | kind: uint 102 | keyed: true 103 | notification_emails: 104 | - telemetry-client-dev@mozilla.com 105 | 106 | keyed_unsigned_int: 107 | bug_numbers: 108 | - 1277806 109 | description: A testing keyed uint scalar; not meant to be touched. 110 | expires: never 111 | kind: uint 112 | keyed: true 113 | notification_emails: 114 | - telemetry-client-dev@mozilla.com 115 | 116 | keyed_boolean_kind: 117 | bug_numbers: 118 | - 1277806 119 | description: A testing keyed boolean scalar; not meant to be touched. 120 | expires: never 121 | kind: boolean 122 | keyed: true 123 | notification_emails: 124 | - telemetry-client-dev@mozilla.com 125 | record_in_processes: 126 | - 'main' 127 | - 'content' 128 | 129 | content_only_uint: 130 | bug_numbers: 131 | - 1278556 132 | description: A testing uint scalar; not meant to be touched. 
133 | expires: never 134 | kind: uint 135 | notification_emails: 136 | - telemetry-client-dev@mozilla.com 137 | record_in_processes: 138 | - 'content' 139 | 140 | all_processes_uint: 141 | bug_numbers: 142 | - 1278556 143 | description: A testing uint scalar; not meant to be touched. 144 | expires: never 145 | kind: uint 146 | notification_emails: 147 | - telemetry-client-dev@mozilla.com 148 | record_in_processes: 149 | - 'all' 150 | 151 | all_child_processes_string: 152 | bug_numbers: 153 | - 1278556 154 | description: A testing string scalar; not meant to be touched. 155 | expires: never 156 | kind: string 157 | notification_emails: 158 | - telemetry-client-dev@mozilla.com 159 | record_in_processes: 160 | - 'all_childs' 161 | 162 | other.test: 163 | test_probe: 164 | bug_numbers: 165 | - 1276190 166 | description: > 167 | This is a test uint type with a really long description, maybe spanning even multiple 168 | lines, to just prove a point: everything works just fine. 169 | expires: never 170 | kind: uint 171 | cpp_guard: 'XP_WIN' 172 | notification_emails: 173 | - telemetry-client-dev@mozilla.com 174 | 175 | multistore_probe: 176 | bug_numbers: 177 | - 1276190 178 | description: > 179 | This is a test uint type with a really long description, maybe spanning even multiple 180 | lines, to just prove a point: everything works just fine. 181 | expires: never 182 | kind: uint 183 | notification_emails: 184 | - telemetry-client-dev@mozilla.com 185 | record_into_store: 186 | - main 187 | - store2 188 | -------------------------------------------------------------------------------- /tests/test_buildhub.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pytest 4 | 5 | from probe_scraper.scrapers.buildhub import Buildhub, NoDataFoundException 6 | 7 | FX_RELEASE_62_0_3 = { 8 | "revision": "c9ed11ae5c79df3dcb69075e1c9da0317d1ecb1b", 9 | "date": datetime(2018, 10, 1, 18, 40, 35), 10 | "version": "62.0.3rc1", 11 | "tree": "releases/mozilla-release", 12 | } 13 | 14 | VERBOSE = True 15 | 16 | 17 | @pytest.fixture 18 | def records(): 19 | return [ 20 | { 21 | "_source": { 22 | "download": {"date": "2019-01-28T23:49:22.717388+00:00"}, 23 | "source": {"revision": "abc", "tree": "releases/mozilla-release"}, 24 | "target": {"version": "1"}, 25 | } 26 | }, 27 | { 28 | "_source": { 29 | "download": {"date": "2019-01-29T23:49:22Z"}, 30 | "source": {"revision": "def", "tree": "releases/mozilla-release"}, 31 | "target": {"version": "2"}, 32 | } 33 | }, 34 | ] 35 | 36 | 37 | @pytest.mark.web_dependency 38 | def test_nightly_count(): 39 | channel, min_version, max_version = "nightly", 62, 62 40 | 41 | bh = Buildhub() 42 | releases = bh.get_revision_dates( 43 | channel, min_version, max_version=max_version, verbose=VERBOSE 44 | ) 45 | assert len(releases) == 97 46 | 47 | 48 | @pytest.mark.web_dependency 49 | def test_pagination(): 50 | channel, min_version, max_version = "nightly", 62, 62 51 | 52 | bh = Buildhub() 53 | releases = bh.get_revision_dates( 54 | channel, min_version, max_version=max_version, verbose=VERBOSE, window=10 55 | ) 56 | assert len(releases) == 97 57 | 58 | 59 | @pytest.mark.web_dependency 60 | def test_duplicate_revisions(): 61 | channel, min_version, max_version = "nightly", 67, 67 62 | 63 | bh = Buildhub() 64 | releases = bh.get_revision_dates( 65 | channel, min_version, max_version=max_version, verbose=VERBOSE 66 | ) 67 | assert len({r["revision"] for r in releases}) == len(releases) 68 | 69 | 70 | 
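# The web_dependency marker used on the tests below is declared in
# pytest.ini; a typical way to skip the network-bound tests locally
# (the command line is an illustration, not from this repo's docs):
#
#     pytest -m "not web_dependency" tests/test_buildhub.py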
@pytest.mark.web_dependency 71 | def test_release(): 72 | channel, min_version, max_version = "release", 62, 62 73 | 74 | bh = Buildhub() 75 | releases = bh.get_revision_dates( 76 | channel, min_version, max_version=max_version, verbose=VERBOSE 77 | ) 78 | 79 | assert FX_RELEASE_62_0_3 in releases 80 | 81 | 82 | @pytest.mark.web_dependency 83 | def test_min_release(): 84 | channel, min_version, max_version = "release", 63, 63 85 | 86 | bh = Buildhub() 87 | releases = bh.get_revision_dates( 88 | channel, min_version, max_version=max_version, verbose=VERBOSE 89 | ) 90 | 91 | assert FX_RELEASE_62_0_3 not in releases 92 | 93 | 94 | @pytest.mark.web_dependency 95 | def test_no_min_max_version_overlap(): 96 | channel, min_version, max_version = "release", 63, 62 97 | bh = Buildhub() 98 | 99 | with pytest.raises(NoDataFoundException): 100 | bh.get_revision_dates( 101 | channel, min_version, max_version=max_version, verbose=VERBOSE 102 | ) 103 | 104 | 105 | @pytest.mark.web_dependency 106 | def test_no_released_version(): 107 | channel, min_version = "release", 199 108 | bh = Buildhub() 109 | 110 | with pytest.raises(NoDataFoundException): 111 | bh.get_revision_dates(channel, min_version, verbose=VERBOSE) 112 | 113 | 114 | def test_version_200(): 115 | channel, min_version = "release", 200 116 | bh = Buildhub() 117 | 118 | with pytest.raises(AssertionError): 119 | bh.get_revision_dates(channel, min_version, verbose=VERBOSE) 120 | 121 | 122 | def test_cleaned_dates(records): 123 | bh = Buildhub() 124 | 125 | expected = [ 126 | { 127 | "revision": "abc", 128 | "date": datetime(2019, 1, 28, 23, 49, 22, 717388), 129 | "version": "1", 130 | "tree": "releases/mozilla-release", 131 | }, 132 | { 133 | "revision": "def", 134 | "date": datetime(2019, 1, 29, 23, 49, 22), 135 | "version": "2", 136 | "tree": "releases/mozilla-release", 137 | }, 138 | ] 139 | 140 | assert bh._distinct_and_clean(records) == expected 141 | 142 | 143 | # Test unique and sorted values 144 | def test_unique_sorted(records): 145 | bh = Buildhub() 146 | 147 | records[1]["_source"]["source"]["revision"] = "abc" 148 | records[1]["_source"]["download"]["date"] = "2019-01-22T23:49:22Z" 149 | 150 | expected = [ 151 | { 152 | "revision": "abc", 153 | "date": datetime(2019, 1, 22, 23, 49, 22), 154 | "version": "2", 155 | "tree": "releases/mozilla-release", 156 | }, 157 | ] 158 | 159 | assert bh._distinct_and_clean(records) == expected 160 | -------------------------------------------------------------------------------- /tests/test_event_parser.py: -------------------------------------------------------------------------------- 1 | from probe_scraper.parsers.events import EventsParser 2 | 3 | 4 | def is_string(s): 5 | return isinstance(s, str) 6 | 7 | 8 | def test_event_parser(): 9 | # Parse the events from the test definitions. 10 | parser = EventsParser() 11 | parsed_events = parser.parse(["tests/resources/test_events.yaml"], "55") 12 | 13 | # Make sure we loaded all the events. 14 | assert len(parsed_events) == 5 15 | 16 | # Make sure each of them contains all the required fields and details. 17 | REQUIRED_FIELDS = [ 18 | "cpp_guard", 19 | "description", 20 | "details", 21 | "expiry_version", 22 | "optout", 23 | "bug_numbers", 24 | ] 25 | REQUIRED_DETAILS = ["methods", "objects", "extra_keys", "record_in_processes"] 26 | 27 | for name, data in parsed_events.items(): 28 | assert is_string(name) 29 | 30 | # Make sure we have all the required fields and details. 
31 | for field in REQUIRED_FIELDS: 32 | assert field in data 33 | 34 | for field in REQUIRED_DETAILS: 35 | assert field in data["details"] 36 | 37 | 38 | def parse(channel, version): 39 | parser = EventsParser() 40 | return parser.parse(["tests/resources/test_events.yaml"], version, channel) 41 | 42 | 43 | def test_channel_version_ignore(): 44 | assert parse("release", 52) == {} 45 | assert parse("release", 53) != {} 46 | 47 | assert parse("beta", 52) == {} 48 | assert parse("beta", 53) != {} 49 | 50 | assert parse("nightly", 52) == {} 51 | assert parse("nightly", 53) == {} 52 | assert parse("nightly", 54) != {} 53 | -------------------------------------------------------------------------------- /tests/test_fog_checks.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Dict, List 4 | 5 | import pytest 6 | 7 | from probe_scraper import fog_checks, transform_probes 8 | from probe_scraper.parsers.repositories import Repository 9 | from probe_scraper.scrapers.git_scraper import Commit 10 | 11 | FAKE_METRIC = { 12 | "type": "string", 13 | "expires": "never", 14 | "notification_emails": ["bar@foo.com"], 15 | "bugs": ["https://bugzilla.mozilla.org/show_bug.cgi?id=1701769"], 16 | } 17 | 18 | FAKE_REPO_META = { 19 | "notification_emails": ["foo@bar.com"], 20 | } 21 | 22 | 23 | @pytest.fixture 24 | def fake_latest_nightly_version() -> str: 25 | return "100" 26 | 27 | 28 | @pytest.fixture 29 | def fake_metrics(fake_latest_nightly_version) -> Dict[str, Dict]: 30 | return { 31 | "category.name.metric_name": FAKE_METRIC, 32 | "expired.category.name.metric_name": {**FAKE_METRIC, "expires": "expired"}, 33 | } 34 | 35 | 36 | @pytest.fixture 37 | def fake_commit_timestamp() -> int: 38 | return int(datetime.now().timestamp()) 39 | 40 | 41 | @pytest.fixture 42 | def fake_metrics_by_commit( 43 | fake_commit_timestamp, fake_metrics 44 | ) -> Dict[str, Dict[str, Dict]]: 45 | return { 46 | Commit( 47 | hash="deadcode", 48 | timestamp=fake_commit_timestamp, 49 | reflog_index=0, 50 | is_head=True, 51 | ): { 52 | **fake_metrics, 53 | "newer.category.name.metric_name": FAKE_METRIC, 54 | }, 55 | Commit( 56 | hash="decafcaf", 57 | timestamp=fake_commit_timestamp, 58 | reflog_index=1, 59 | is_head=False, 60 | ): fake_metrics, 61 | } 62 | 63 | 64 | @pytest.fixture 65 | def fake_commits(fake_commit_timestamp) -> Dict[Commit, List[Path]]: 66 | # `decafcaf` should remain the most recent SHA. 
67 | return { 68 | Commit( 69 | hash="decafcaf", 70 | timestamp=fake_commit_timestamp, 71 | reflog_index=1, 72 | is_head=False, 73 | ): [], 74 | Commit( 75 | hash="deadcode", 76 | timestamp=fake_commit_timestamp, 77 | reflog_index=0, 78 | is_head=True, 79 | ): [], 80 | } 81 | 82 | 83 | @pytest.fixture 84 | def fake_metrics_by_repo_by_commit( 85 | fake_metrics_by_commit, fake_repos 86 | ) -> Dict[str, Dict[Commit, Dict[str, Dict]]]: 87 | return { 88 | repo.name: { 89 | commit: { 90 | f"{metric_name}_{repo.name}": metric 91 | for metric_name, metric in metrics.items() 92 | } 93 | for commit, metrics in fake_metrics_by_commit.items() 94 | } 95 | for repo in fake_repos 96 | } 97 | 98 | 99 | @pytest.fixture 100 | def fake_metrics_by_repo( 101 | fake_metrics_by_repo_by_commit, 102 | ) -> Dict[str, Dict[str, Dict[str, Dict]]]: 103 | return transform_probes.transform_metrics_by_hash(fake_metrics_by_repo_by_commit) 104 | 105 | 106 | @pytest.fixture 107 | def fake_repos() -> List[Repository]: 108 | return [ 109 | Repository("glean-core", dict(FAKE_REPO_META, library_names=["glean-core"])), 110 | Repository("firefox-desktop", dict(FAKE_REPO_META, dependencies=["gecko"])), 111 | Repository("gecko", dict(FAKE_REPO_META, dependencies=["glean-core"])), 112 | ] 113 | 114 | 115 | @pytest.fixture 116 | def fake_commits_by_repo( 117 | fake_repos, fake_commits 118 | ) -> Dict[str, Dict[Commit, List[Path]]]: 119 | return {repo.name: fake_commits for repo in fake_repos} 120 | 121 | 122 | def test_get_current_metrics(fake_metrics_by_repo): 123 | current_metrics_by_repo = fog_checks.get_current_metrics_by_repo( 124 | fake_metrics_by_repo 125 | ) 126 | assert ( 127 | "newer.category.name.metric_name_glean-core" 128 | in current_metrics_by_repo["glean-core"] 129 | ) 130 | 131 | 132 | def test_get_expiring_metrics(fake_metrics, fake_latest_nightly_version): 133 | expiring_metrics = fog_checks.get_expiring_metrics( 134 | { 135 | **fake_metrics, 136 | "expiring.metric_name": { 137 | **FAKE_METRIC, 138 | "expires": str(int(fake_latest_nightly_version) + 1), 139 | }, 140 | }, 141 | fake_latest_nightly_version, 142 | ) 143 | assert "expired.category.name.metric_name" in expiring_metrics 144 | assert "expiring.metric_name" in expiring_metrics 145 | assert "category.name.metric_name" not in expiring_metrics 146 | 147 | 148 | def test_fbagefem_does_nothing_with_no_fog_repos(fake_metrics_by_repo, fake_repos): 149 | fake_repos = [repo for repo in fake_repos if repo.name not in fog_checks.FOG_REPOS] 150 | fake_metrics_by_repo = { 151 | repo_name: metrics 152 | for repo_name, metrics in fake_metrics_by_repo.items() 153 | if repo_name not in fog_checks.FOG_REPOS 154 | } 155 | expiry_emails = fog_checks.file_bugs_and_get_emails_for_expiring_metrics( 156 | fake_repos, fake_metrics_by_repo, None, True 157 | ) 158 | assert expiry_emails is None 159 | 160 | 161 | @pytest.mark.web_dependency # fbagefem gets the latest nightly version from product-info 162 | def test_fbagefem_returns_emails_for_expiring_metrics(fake_metrics_by_repo, fake_repos): 163 | expiry_emails = fog_checks.file_bugs_and_get_emails_for_expiring_metrics( 164 | fake_repos, 165 | fake_metrics_by_repo, 166 | None, 167 | True, 168 | ) 169 | for fog_repo in fog_checks.FOG_REPOS: 170 | assert f"expired_metrics_{fog_repo}" in expiry_emails 171 | assert len(expiry_emails[f"expired_metrics_{fog_repo}"]["emails"]) == 1 172 | 173 | 174 | def test_bug_number_from_url(): 175 | assert ( 176 | fog_checks.bug_number_from_url( 177 | 
"https://bugzilla.mozilla.org/show_bug.cgi?id=1701769" 178 | ) 179 | == 1701769 180 | ) 181 | assert ( 182 | fog_checks.bug_number_from_url("https://bugzilla.mozilla.org/1885138") 183 | == 1885138 184 | ) 185 | assert ( 186 | fog_checks.bug_number_from_url( 187 | "https://bugzilla.mozilla.org/show_bug.cgi?id=1701769#c1" 188 | ) 189 | == 1701769 190 | ) 191 | assert fog_checks.bug_number_from_url("https://bugzil.la/1701769") == 1701769 192 | # Parser shouldn't give a good number for github urls 193 | assert ( 194 | fog_checks.bug_number_from_url( 195 | "https://github.com/mozilla/probe-scraper/pull/382" 196 | ) 197 | is None 198 | ) 199 | -------------------------------------------------------------------------------- /tests/test_glean_checks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from probe_scraper.glean_checks import check_for_duplicate_metrics 4 | from probe_scraper.parsers.repositories import Repository 5 | 6 | OLD_DATE = "2019-10-04 17:30:42" 7 | NEWER_DATE = "2019-12-04 17:30:42" 8 | NEWEST_DATE = "2020-12-04 17:30:42" 9 | 10 | BASE_METADATA = {"notification_emails": ["foo@bar.com"]} 11 | 12 | 13 | @pytest.fixture 14 | def fake_repositories(): 15 | return [ 16 | Repository("glean-core", dict(BASE_METADATA, library_names=["glean-core"])), 17 | Repository( 18 | "glean-android", dict(BASE_METADATA, library_names=["glean-android"]) 19 | ), 20 | Repository( 21 | "fake-app", 22 | dict(BASE_METADATA, dependencies=["glean-core", "glean-android"]), 23 | ), 24 | ] 25 | 26 | 27 | def test_check_duplicate_metrics_no_duplicates(fake_repositories): 28 | # no overlap between metrics defined by glean-core and glean-android (both used by burnham) 29 | # check_for_duplicate_metrics should return False 30 | assert not check_for_duplicate_metrics( 31 | fake_repositories, 32 | { 33 | "glean-core": { 34 | "app_display_version": { 35 | "history": [ 36 | dict( 37 | BASE_METADATA, dates={"first": OLD_DATE, "last": NEWER_DATE} 38 | ) 39 | ] 40 | } 41 | }, 42 | "glean-android": { 43 | "app_display_version_android": { 44 | "history": [ 45 | dict( 46 | BASE_METADATA, dates={"first": OLD_DATE, "last": NEWER_DATE} 47 | ) 48 | ] 49 | } 50 | }, 51 | "fake-app": {}, 52 | }, 53 | {}, 54 | ) 55 | 56 | 57 | def test_check_duplicate_metrics_duplicates(fake_repositories): 58 | # glean-core and glean-android define the same metric in the current date 59 | # check_for_duplicate_metrics should return True 60 | assert check_for_duplicate_metrics( 61 | fake_repositories, 62 | { 63 | "glean-core": { 64 | "app_display_version": { 65 | "history": [ 66 | dict( 67 | BASE_METADATA, dates={"first": OLD_DATE, "last": NEWER_DATE} 68 | ) 69 | ] 70 | } 71 | }, 72 | "glean-android": { 73 | "app_display_version": { 74 | "history": [ 75 | dict( 76 | BASE_METADATA, dates={"first": OLD_DATE, "last": NEWER_DATE} 77 | ) 78 | ] 79 | }, 80 | }, 81 | "fake-app": {}, 82 | }, 83 | {}, 84 | ) 85 | 86 | 87 | def test_check_duplicate_metrics_duplicates_in_the_past(fake_repositories): 88 | # glean-core and glean-android define the same metric at one point in the 89 | # past, but not presently 90 | # check_for_duplicate_metrics should return False 91 | assert not check_for_duplicate_metrics( 92 | fake_repositories, 93 | { 94 | "glean-core": { 95 | "app_display_version": { 96 | "history": [ 97 | dict( 98 | BASE_METADATA, dates={"first": OLD_DATE, "last": NEWER_DATE} 99 | ) 100 | ] 101 | } 102 | }, 103 | "glean-android": { 104 | "app_display_version": { 105 | "history": [ 106 | dict( 
107 | BASE_METADATA, dates={"first": OLD_DATE, "last": NEWER_DATE} 108 | ), 109 | ] 110 | }, 111 | "new_metric": { 112 | "history": [ 113 | # the newer date here implies that app_display_version above was removed 114 | dict( 115 | BASE_METADATA, 116 | dates={"first": OLD_DATE, "last": NEWEST_DATE}, 117 | ), 118 | ] 119 | }, 120 | }, 121 | "fake-app": {}, 122 | }, 123 | {}, 124 | ) 125 | -------------------------------------------------------------------------------- /tests/test_glean_limit_date.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | 6 | import json 7 | import os 8 | from datetime import datetime, time, timedelta 9 | from pathlib import Path 10 | from uuid import uuid4 11 | 12 | import git 13 | import pytest 14 | import yaml 15 | from git import Head, Repo 16 | 17 | import probe_scraper.runner 18 | 19 | 20 | @pytest.fixture 21 | def test_dir(tmp_path_factory) -> Path: 22 | # Where we will build the test git repo 23 | return tmp_path_factory.mktemp("test_git_repositories") 24 | 25 | 26 | def generate_repo( 27 | test_dir: Path, 28 | repo_name: str, 29 | branch: str = "main", 30 | skip_commits: int = 0, 31 | num_commits: int = 1, 32 | base_dir: Path = Path("tests/resources/test_repo_files"), 33 | base_datetime: datetime = datetime.utcnow(), 34 | ) -> Path: 35 | directory = test_dir / f"{repo_name}-{uuid4().hex}" 36 | repo = Repo.init(directory) 37 | # Ensure the default branch is using a fixed name. 38 | # User config could change that, 39 | # breaking tests with implicit assumptions further down the line. 40 | repo.head.reference = Head(repo, f"refs/heads/{branch}") 41 | 42 | base_path = base_dir / repo_name 43 | for i in range(skip_commits, skip_commits + num_commits): 44 | files_dir = base_path / str(i) 45 | if not files_dir.exists(): 46 | break 47 | 48 | for path in files_dir.iterdir(): 49 | print(f"Copying file {path.name}") 50 | destination = directory / path.name 51 | destination.write_bytes(path.read_bytes()) 52 | 53 | repo.index.add("*") 54 | # We need to synthesize the timestamps of commits to each be a second 55 | # apart, otherwise the commits may be at exactly the same second, which 56 | # means they won't always sort in order, and thus the merging of identical 57 | # metrics in adjacent commits may not happen correctly. 
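        # For example (values illustrative), with base_datetime at midnight the
        # format below stamps commit 0 at 00:00:00 and commit 1 at 00:00:01:
        #
        #   f"{datetime(2024, 1, 1) + timedelta(seconds=1):%Y-%m-%dT%H:%M:%S}"
        #   -> "2024-01-01T00:00:01"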
58 | commit_date = f"{base_datetime + timedelta(seconds=i):%Y-%m-%dT%H:%M:%S}" 59 | repo.index.commit(f"Commit {i}", commit_date=commit_date) 60 | 61 | return directory 62 | 63 | 64 | def test_single_commit(test_dir: Path): 65 | today_date = datetime.utcnow().date() 66 | today_datetime = datetime.combine(today_date, time.min) 67 | repo_path = generate_repo( 68 | test_dir, 69 | "normal", 70 | num_commits=2, 71 | # each commit after the first adds 1 second to base_datetime, so setting 72 | # glean_limit_date=today_date and base_datetime to 1 second before that will 73 | # only collect the second commit 74 | base_datetime=today_datetime - timedelta(seconds=1), 75 | ) 76 | 77 | repositories_info = { 78 | "version": "2", 79 | "libraries": [], 80 | "applications": [ 81 | { 82 | "app_name": "example", 83 | "canonical_app_name": "Example", 84 | "app_description": "foo", 85 | "url": str(repo_path), 86 | "notification_emails": ["nobody@example.com"], 87 | "metrics_files": ["metrics.yaml"], 88 | "channels": [ 89 | { 90 | "v1_name": "example", 91 | "app_id": "example", 92 | "app_channel": "release", 93 | } 94 | ], 95 | } 96 | ], 97 | } 98 | repositories_file = test_dir / "repositories.yaml" 99 | repositories_file.write_text(yaml.dump(repositories_info)) 100 | 101 | # generate output with date limit 102 | actual_kwargs = dict( 103 | cache_dir=test_dir / "cache", 104 | out_dir=test_dir / "actual", 105 | firefox_version=None, 106 | min_firefox_version=None, 107 | process_moz_central_probes=False, 108 | process_glean_metrics=True, 109 | repositories_file=repositories_file, 110 | dry_run=True, 111 | glean_repos=None, 112 | firefox_channel=None, 113 | output_bucket="", 114 | cache_bucket=None, 115 | env="dev", 116 | bugzilla_api_key=None, 117 | glean_urls=[str(repo_path)], 118 | glean_commit=None, 119 | glean_commit_branch=None, 120 | email_file=test_dir / "emails.txt", 121 | update=True, 122 | glean_limit_date=today_date, 123 | ) 124 | probe_scraper.runner.main(**actual_kwargs) 125 | 126 | # shallow clone repo with single commit to generate expected output 127 | original_repo_path = repo_path.parent / f"{repo_path.name}-original" 128 | os.rename(repo_path, original_repo_path) 129 | # must use file:// or git will ignore --depth 130 | git.Repo.clone_from(f"file://{original_repo_path.absolute()}", repo_path, depth=1) 131 | expect_kwargs = { 132 | **actual_kwargs, 133 | "update": False, 134 | "out_dir": test_dir / "expect", 135 | "glean_limit_date": None, 136 | } 137 | probe_scraper.runner.main(**expect_kwargs) 138 | # validate 139 | expect_metrics = json.loads( 140 | (test_dir / "expect" / "glean" / "example" / "metrics").read_text() 141 | ) 142 | actual_metrics = json.loads( 143 | (test_dir / "actual" / "glean" / "example" / "metrics").read_text() 144 | ) 145 | assert expect_metrics == actual_metrics 146 | 147 | 148 | def test_add_commit(test_dir: Path): 149 | today_date = datetime.utcnow().date() 150 | today_datetime = datetime.combine(today_date, time.min) 151 | repo_path = generate_repo( 152 | test_dir, 153 | "normal", 154 | num_commits=2, 155 | # each commit after the first adds 1 second to base_datetime, so setting 156 | # glean_limit_date=today_date and base_datetime to 1 second before that will 157 | # only collect the second commit 158 | base_datetime=today_datetime - timedelta(seconds=1), 159 | ) 160 | 161 | repositories_info = { 162 | "version": "2", 163 | "libraries": [], 164 | "applications": [ 165 | { 166 | "app_name": "example", 167 | "canonical_app_name": "Example", 168 | "app_description": 
"foo", 169 | "url": str(repo_path), 170 | "notification_emails": ["nobody@example.com"], 171 | "metrics_files": ["metrics.yaml"], 172 | "channels": [ 173 | { 174 | "v1_name": "example", 175 | "app_id": "example", 176 | "app_channel": "release", 177 | } 178 | ], 179 | } 180 | ], 181 | } 182 | repositories_file = test_dir / "repositories.yaml" 183 | repositories_file.write_text(yaml.dump(repositories_info)) 184 | 185 | # generate expected output without date limit 186 | expect_kwargs = dict( 187 | cache_dir=test_dir / "cache", 188 | out_dir=test_dir / "expect", 189 | firefox_version=None, 190 | min_firefox_version=None, 191 | process_moz_central_probes=False, 192 | process_glean_metrics=True, 193 | repositories_file=repositories_file, 194 | dry_run=True, 195 | glean_repos=None, 196 | firefox_channel=None, 197 | output_bucket="", 198 | cache_bucket=None, 199 | env="dev", 200 | bugzilla_api_key=None, 201 | glean_urls=[str(repo_path)], 202 | glean_commit=None, 203 | glean_commit_branch=None, 204 | email_file=test_dir / "emails.txt", 205 | update=False, 206 | glean_limit_date=None, 207 | ) 208 | probe_scraper.runner.main(**expect_kwargs) 209 | 210 | # clone repo with only first commit to initialize state before updating 211 | actual_kwargs = {**expect_kwargs, "out_dir": test_dir / "actual"} 212 | original_repo_path = repo_path.parent / f"{repo_path.name}-original" 213 | os.rename(repo_path, original_repo_path) 214 | repo = git.Repo.clone_from(original_repo_path, repo_path) 215 | repo.git.reset("HEAD~", hard=True) 216 | probe_scraper.runner.main(**actual_kwargs) 217 | # validate files are initially different 218 | expect_metrics = json.loads( 219 | (test_dir / "expect" / "glean" / "example" / "metrics").read_text() 220 | ) 221 | actual_metrics = json.loads( 222 | (test_dir / "actual" / "glean" / "example" / "metrics").read_text() 223 | ) 224 | assert expect_metrics != actual_metrics 225 | 226 | # update with second commit and date limit 227 | repo.git.pull() 228 | actual_kwargs["update"] = True 229 | actual_kwargs["glean_limit_date"] = today_date 230 | probe_scraper.runner.main(**actual_kwargs) 231 | # validate files are now equivalent 232 | expect_metrics = json.loads( 233 | (test_dir / "expect" / "glean" / "example" / "metrics").read_text() 234 | ) 235 | for metric in expect_metrics: 236 | for element in expect_metrics[metric]["history"]: 237 | for index in ("first", "last"): 238 | # reflog index is expected to be inaccurate in update mode 239 | element["reflog-index"][index] = 0 240 | actual_metrics = json.loads( 241 | (test_dir / "actual" / "glean" / "example" / "metrics").read_text() 242 | ) 243 | assert expect_metrics == actual_metrics 244 | -------------------------------------------------------------------------------- /tests/test_glean_push.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | 6 | import datetime 7 | import os 8 | import time 9 | import unittest.mock 10 | from contextlib import contextmanager 11 | from pathlib import Path 12 | from unittest.mock import Mock 13 | 14 | import pytest 15 | import yaml 16 | from git import Head, Repo 17 | 18 | from probe_scraper import glean_push 19 | 20 | 21 | @contextmanager 22 | def pushd(path: Path): 23 | cwd = os.getcwd() 24 | try: 25 | os.chdir(path) 26 | yield 27 | finally: 28 | os.chdir(cwd) 29 | 30 | 31 | @pytest.fixture(autouse=True) 32 | def empty_output_bucket(): 33 | with unittest.mock.patch.dict(os.environ, {"OUTPUT_BUCKET": ""}): 34 | yield 35 | 36 | 37 | @pytest.fixture 38 | def test_dir(tmp_path_factory) -> Path: 39 | # Where we will build the test git repo 40 | return tmp_path_factory.mktemp("test_git_repositories") 41 | 42 | 43 | @pytest.fixture 44 | def repositories_file(test_dir: Path) -> Path: 45 | # Where we will write the repositories file 46 | return test_dir / "repositories.yaml" 47 | 48 | 49 | def generate_repo( 50 | test_dir: Path, 51 | repo_name: str, 52 | branch: str = "main", 53 | num_commits: int = 1, 54 | base_dir: Path = Path("tests/resources/test_repo_files"), 55 | ) -> Path: 56 | directory = test_dir / repo_name 57 | repo = Repo.init(directory) 58 | # Ensure the default branch is using a fixed name. 59 | # User config could change that, 60 | # breaking tests with implicit assumptions further down the line. 61 | repo.head.reference = Head(repo, f"refs/heads/{branch}") 62 | 63 | # We need to synthesize the time stamps of commits to each be a second 64 | # apart, otherwise the commits may be at exactly the same second, which 65 | # means they won't always sort in order, and thus the merging of identical 66 | # metrics in adjacent commits may not happen correctly. 
67 | base_time = time.time() 68 | 69 | base_path = base_dir / repo_name 70 | for i in range(num_commits): 71 | files_dir = base_path / str(i) 72 | if not files_dir.exists(): 73 | break 74 | 75 | for path in files_dir.iterdir(): 76 | print(f"Copying file {path.name}") 77 | destination = directory / path.name 78 | destination.write_bytes(path.read_bytes()) 79 | 80 | repo.index.add("*") 81 | commit_date = datetime.datetime.fromtimestamp(base_time + i).isoformat() 82 | # split() is safe when there is no fractional part, unlike slicing on 83 | # find("."), which returns -1 and would chop the last character. 84 | commit_date = commit_date.split(".")[0] 85 | repo.index.commit(f"Commit {i}", commit_date=commit_date) 86 | 87 | return directory 88 | 89 | 90 | def test_missing_metrics_file(test_dir: Path, repositories_file: Path): 91 | repo_path = generate_repo(test_dir, "normal") 92 | commit = Repo(repo_path).head.commit.hexsha 93 | data = {"url": str(repo_path), "commit": commit, "branch": ""} 94 | request = Mock(get_json=Mock(return_value=data)) 95 | 96 | repositories_info = { 97 | "version": "2", 98 | "libraries": [], 99 | "applications": [ 100 | { 101 | "app_name": "example", 102 | "canonical_app_name": "Example", 103 | "app_description": "foo", 104 | "url": str(repo_path), 105 | "notification_emails": ["nobody@example.com"], 106 | "metrics_files": ["missing/metrics.yaml"], 107 | "channels": [ 108 | { 109 | "v1_name": "example", 110 | "app_id": "app-id", 111 | "app_channel": "release", 112 | } 113 | ], 114 | } 115 | ], 116 | } 117 | repositories_file.write_text(yaml.dump(repositories_info)) 118 | with pushd(repositories_file.parent): 119 | response = glean_push.main(request) 120 | assert response.status_code == 400 121 | assert ( 122 | response.data.decode() 123 | == f"Error: missing/metrics.yaml not found in commit {commit} for app-id\n" 124 | ) 125 | 126 | repositories_info["applications"][0]["deprecated"] = True 127 | repositories_file.write_text(yaml.dump(repositories_info)) 128 | with pushd(repositories_file.parent): 129 | response = glean_push.main(request) 130 | assert response.status_code == 200 131 | assert response.data.decode() == "update is valid, but not published\n" 132 | -------------------------------------------------------------------------------- /tests/test_histogram_parser.py: -------------------------------------------------------------------------------- 1 | from probe_scraper.parsers.histograms import HistogramsParser 2 | 3 | 4 | def is_string(s): 5 | return isinstance(s, str) 6 | 7 | 8 | def histogram_parser(version, usecounter_optout): 9 | FILES = [ 10 | "tests/resources/Histograms.json", 11 | "tests/resources/nsDeprecatedOperationList.h", 12 | "tests/resources/UseCounters.conf", 13 | ] 14 | 15 | HISTOGRAMS = [ 16 | "TELEMETRY_TEST_FLAG", 17 | "TELEMETRY_TEST_COUNT", 18 | "TELEMETRY_TEST_COUNT2", 19 | "TELEMETRY_TEST_COUNT_INIT_NO_RECORD", 20 | "TELEMETRY_TEST_CATEGORICAL", 21 | "TELEMETRY_TEST_CATEGORICAL_OPTOUT", 22 | "TELEMETRY_TEST_CATEGORICAL_NVALUES", 23 | "TELEMETRY_TEST_CATEGORICAL_EMPTY_LABELS", 24 | "TELEMETRY_TEST_KEYED_COUNT_INIT_NO_RECORD", 25 | "TELEMETRY_TEST_KEYED_FLAG", 26 | "TELEMETRY_TEST_KEYED_COUNT", 27 | "TELEMETRY_TEST_KEYED_BOOLEAN", 28 | "TELEMETRY_TEST_RELEASE_OPTOUT", 29 | "TELEMETRY_TEST_RELEASE_OPTIN", 30 | "TELEMETRY_TEST_KEYED_RELEASE_OPTIN", 31 | "TELEMETRY_TEST_KEYED_RELEASE_OPTOUT", 32 | "TELEMETRY_TEST_EXPONENTIAL", 33 | "TELEMETRY_TEST_LINEAR", 34 | "TELEMETRY_TEST_BOOLEAN", 35 | "TELEMETRY_TEST_EXPIRED", 36 | "TELEMETRY_TEST_ALL_CHILDREN", 37 | "TELEMETRY_TEST_ALL_CHILDS", 38 | "EXPRESSION_IN_LOW_HIGH_ATTRIBUTE", 39 |
"NON_INTEGER_IN_HIGH_ATTRIBUTE", 40 | "HISTOGRAM_WITH_MULTISTORE", 41 | ] 42 | 43 | USE_COUNTERS = [ 44 | "USE_COUNTER2_SVGSVGELEMENT_GETELEMENTBYID_DOCUMENT", 45 | "USE_COUNTER2_SVGSVGELEMENT_GETELEMENTBYID_PAGE", 46 | "USE_COUNTER2_SVGSVGELEMENT_CURRENTSCALE_getter_DOCUMENT", 47 | "USE_COUNTER2_SVGSVGELEMENT_CURRENTSCALE_getter_PAGE", 48 | "USE_COUNTER2_SVGSVGELEMENT_CURRENTSCALE_setter_DOCUMENT", 49 | "USE_COUNTER2_SVGSVGELEMENT_CURRENTSCALE_setter_PAGE", 50 | "USE_COUNTER2_PROPERTY_FILL_DOCUMENT", 51 | "USE_COUNTER2_PROPERTY_FILL_PAGE", 52 | ] 53 | 54 | DEPRECATED_OPERATIONS = [ 55 | "USE_COUNTER2_DEPRECATED_GetAttributeNode_DOCUMENT", 56 | "USE_COUNTER2_DEPRECATED_GetAttributeNode_PAGE", 57 | "USE_COUNTER2_DEPRECATED_SetAttributeNode_DOCUMENT", 58 | "USE_COUNTER2_DEPRECATED_SetAttributeNode_PAGE", 59 | ] 60 | 61 | # Parse the histograms from the test definitions. 62 | parser = HistogramsParser() 63 | parsed_histograms = parser.parse(FILES, version) 64 | 65 | # Check that all expected histogram keys are present. 66 | ALL_KEYS = HISTOGRAMS + USE_COUNTERS + DEPRECATED_OPERATIONS 67 | assert set(ALL_KEYS) == set(parsed_histograms.keys()) 68 | 69 | # Make sure each of them contains all the required fields and details. 70 | REQUIRED_FIELDS = [ 71 | "cpp_guard", 72 | "description", 73 | "details", 74 | "expiry_version", 75 | "optout", 76 | "bug_numbers", 77 | ] 78 | 79 | REQUIRED_DETAILS = [ 80 | "low", 81 | "high", 82 | "keyed", 83 | "kind", 84 | "n_buckets", 85 | "record_in_processes", 86 | "record_into_store", 87 | ] 88 | 89 | for name, data in parsed_histograms.items(): 90 | assert is_string(name) 91 | 92 | # Check that we have all the required fields for each probe. 93 | for field in REQUIRED_FIELDS: 94 | assert field in data 95 | 96 | # Check that we have all the needed details. 97 | for field in REQUIRED_DETAILS: 98 | assert field in data["details"] 99 | 100 | # If multiple stores set, they should be both listed 101 | if name == "HISTOGRAM_WITH_MULTISTORE": 102 | assert ["main", "store2"] == data["details"]["record_into_store"] 103 | else: 104 | # Default multistore if unspecified is just "main" 105 | assert ["main"] == data["details"]["record_into_store"] 106 | 107 | # Categorical histograms should have a non-empty `details["labels"]`. 108 | if data["details"]["kind"] == "categorical": 109 | assert "labels" in data["details"].keys() and isinstance( 110 | data["details"]["labels"], list 111 | ) 112 | else: 113 | assert "labels" not in data["details"].keys() 114 | 115 | if name.startswith("USE_COUNTER2_"): 116 | assert data["optout"] == usecounter_optout 117 | 118 | 119 | # Test for an old Firefox version. 
120 | def test_histogram_parser_old(): 121 | histogram_parser("55", usecounter_optout=False) 122 | 123 | 124 | # Test for a newer Firefox version with Use Counters on release 125 | def test_histogram_parser_new(): 126 | histogram_parser("70", usecounter_optout=True) 127 | -------------------------------------------------------------------------------- /tests/test_library_refs.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def test_library_refs(): 5 | with open("repositories.yaml", "r") as yaml_file: 6 | repositories = yaml.safe_load(yaml_file) 7 | libs = set() 8 | for library in repositories["libraries"]: 9 | for variant in library["variants"]: 10 | libs.add(variant["dependency_name"]) 11 | for app in repositories["applications"]: 12 | missing_libs = set(app["dependencies"]) - libs 13 | if missing_libs: 14 | raise KeyError( 15 | f'application {app["app_name"]} contains invalid library references: {missing_libs}' 16 | ) 17 | -------------------------------------------------------------------------------- /tests/test_metrics_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from probe_scraper.parsers.metrics import GleanMetricsParser 4 | 5 | 6 | def is_string(s): 7 | return isinstance(s, str) 8 | 9 | 10 | def test_metrics_parser(): 11 | # Parse the metrics from the test definitions. 12 | parser = GleanMetricsParser() 13 | parsed_metrics, errs = parser.parse(["tests/resources/metrics.yaml"], {}) 14 | 15 | assert errs == [] 16 | 17 | # Make sure we loaded all the metrics. 18 | # Notably, we do not check the contents; that is left up to the 19 | # glean parser to handle. 20 | assert len(parsed_metrics) == 2 21 | for name, data in parsed_metrics.items(): 22 | assert is_string(name) 23 | 24 | # Check that ping names are normalized 25 | assert "session-end" in parsed_metrics["example.os"]["send_in_pings"] 26 | 27 | 28 | def test_source_url(): 29 | parser = GleanMetricsParser() 30 | parsed_metrics, errs = parser.parse( 31 | ["tests/resources/metrics.yaml"], {}, "https://www.test.com/foo", "tests" 32 | ) 33 | 34 | assert ( 35 | parsed_metrics["example.duration"]["source_url"] 36 | == "https://www.test.com/foo/blob/tests/resources/metrics.yaml#L4" 37 | ) 38 | assert ( 39 | parsed_metrics["example.os"]["source_url"] 40 | == "https://www.test.com/foo/blob/tests/resources/metrics.yaml#L19" 41 | ) 42 | with pytest.raises(KeyError): 43 | parsed_metrics["example.os"]["defined_in"] 44 | -------------------------------------------------------------------------------- /tests/test_moz_central_scraper.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import pytest 5 | 6 | from probe_scraper.scrapers import moz_central_scraper 7 | 8 | 9 | def test_extract_major_version(): 10 | assert moz_central_scraper.extract_major_version("62.0a1") == 62 11 | assert moz_central_scraper.extract_major_version("63.0.2") == 63 12 | with pytest.raises(Exception): 13 | moz_central_scraper.extract_major_version("helloworld") 14 | 15 | 16 | def path_is_in_version(path, version): 17 | return moz_central_scraper.relative_path_is_in_version(path, version) 18 | 19 | 20 | @pytest.mark.web_dependency 21 | def test_channel_revisions(): 22 | tmp_dir = "./.test-files" 23 | min_fx_version = 62 24 | max_fx_version = 62 25 | channel = "release" 26 | revision = "c9ed11ae5c79df3dcb69075e1c9da0317d1ecb1b" 27
| 28 | res = moz_central_scraper.scrape_channel_revisions( 29 | tmp_dir, min_fx_version, max_fx_version=max_fx_version, channels=[channel] 30 | ) 31 | 32 | registries = { 33 | probe_type: [ 34 | os.path.join(tmp_dir, "hg", revision, path) 35 | for path in paths 36 | if path_is_in_version(path, 62) 37 | ] 38 | for probe_type, paths in moz_central_scraper.REGISTRY_FILES.items() 39 | } 40 | 41 | record = { 42 | "date": datetime(2018, 10, 1, 18, 40, 35), 43 | "version": 62, 44 | "registries": registries, 45 | } 46 | 47 | assert res[channel][revision] == record 48 | -------------------------------------------------------------------------------- /tests/test_repositories_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import jsonschema 5 | import pytest 6 | import yaml 7 | 8 | from probe_scraper.parsers.repositories import RepositoriesParser 9 | 10 | 11 | def write_to_temp_file(data): 12 | fd, path = tempfile.mkstemp() 13 | with os.fdopen(fd, "w") as tmp: 14 | tmp.write(yaml.dump(data)) 15 | return path 16 | 17 | 18 | @pytest.fixture 19 | def parser(): 20 | return RepositoriesParser() 21 | 22 | 23 | @pytest.fixture 24 | def incorrect_repos_file(): 25 | data = { 26 | "some-repo": { 27 | # missing `notification_emails` 28 | "app_id": "mobile-metrics-example", 29 | "description": "foo", 30 | "url": "www.github.com/fbertsch/mobile-metrics-example", 31 | "metrics_files": ["metrics.yaml"], 32 | } 33 | } 34 | 35 | return write_to_temp_file(data) 36 | 37 | 38 | @pytest.fixture 39 | def correct_repos_file(): 40 | data = { 41 | "test-repo": { 42 | "app_id": "mobile-metrics-example", 43 | "description": "foo", 44 | "channel": "release", 45 | "url": "www.github.com/fbertsch/mobile-metrics-example", 46 | "notification_emails": ["frank@mozilla.com"], 47 | "metrics_files": ["metrics.yaml"], 48 | } 49 | } 50 | 51 | return write_to_temp_file(data) 52 | 53 | 54 | @pytest.fixture 55 | def invalid_release_channel_file(): 56 | data = { 57 | "test-repo": { 58 | "app_id": "mobile-metrics-example", 59 | "description": "foo", 60 | "channel": "releaze", 61 | "url": "www.github.com/fbertsch/mobile-metrics-example", 62 | "notification_emails": ["frank@mozilla.com"], 63 | "metrics_files": ["metrics.yaml"], 64 | } 65 | } 66 | 67 | return write_to_temp_file(data) 68 | 69 | 70 | def test_repositories(parser): 71 | parser.validate() 72 | 73 | 74 | def test_repositories_parser_incorrect(parser, incorrect_repos_file): 75 | with pytest.raises(jsonschema.exceptions.ValidationError): 76 | parser.validate(incorrect_repos_file) 77 | 78 | 79 | def test_repositories_parser_invalid_channel(parser, invalid_release_channel_file): 80 | with pytest.raises(jsonschema.exceptions.ValidationError): 81 | parser.validate(invalid_release_channel_file) 82 | 83 | 84 | def test_repositories_class(parser, correct_repos_file): 85 | repos = parser.parse(correct_repos_file) 86 | 87 | assert len(repos) == 1 88 | assert set(repos[0].get_metrics_file_paths()) == {"metrics.yaml"} 89 | assert repos[0].to_dict() == { 90 | "app_id": "mobile-metrics-example", 91 | "channel": "release", 92 | "dependencies": [], 93 | "deprecated": False, 94 | "description": "foo", 95 | "metrics_file_paths": ["metrics.yaml"], 96 | "name": "test-repo", 97 | "notification_emails": ["frank@mozilla.com"], 98 | "ping_file_paths": [], 99 | "prototype": False, 100 | "tag_file_paths": [], 101 | "url": "www.github.com/fbertsch/mobile-metrics-example", 102 | "skip_documentation": False, 103 | 
"moz_pipeline_metadata_defaults": {}, 104 | "moz_pipeline_metadata": {}, 105 | } 106 | -------------------------------------------------------------------------------- /tests/test_scalar_parser.py: -------------------------------------------------------------------------------- 1 | from probe_scraper.parsers.scalars import ScalarsParser 2 | 3 | 4 | def is_string(s): 5 | return isinstance(s, str) 6 | 7 | 8 | def test_scalar_parser(): 9 | # Parse the histograms from the test definitions. 10 | parser = ScalarsParser() 11 | parsed_scalars = parser.parse(["tests/resources/test_scalars.yaml"], "55") 12 | 13 | # Make sure we loaded all the scalars. 14 | assert len(parsed_scalars) == 17 15 | 16 | # Make sure each of them contains all the required fields and details. 17 | REQUIRED_FIELDS = [ 18 | "cpp_guard", 19 | "description", 20 | "details", 21 | "expiry_version", 22 | "optout", 23 | "bug_numbers", 24 | ] 25 | REQUIRED_DETAILS = ["keyed", "kind", "record_in_processes", "record_into_store"] 26 | 27 | for name, data in parsed_scalars.items(): 28 | assert is_string(name) 29 | 30 | # Make sure we have all the required fields and details. 31 | for field in REQUIRED_FIELDS: 32 | assert field in data 33 | 34 | for field in REQUIRED_DETAILS: 35 | assert field in data["details"] 36 | 37 | # If multiple stores set, they should be both listed 38 | if name == "other.test.multistore_probe": 39 | assert ["main", "store2"] == data["details"]["record_into_store"] 40 | else: 41 | # Default multistore if unspecified is just "main" 42 | assert ["main"] == data["details"]["record_into_store"] 43 | --------------------------------------------------------------------------------