├── .circleci └── config.yml ├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── API.md ├── CODE_OF_CONDUCT.md ├── DATA_COLLECTION_POLICY.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── analysis ├── TAARExperimentV2Analysis.ipynb ├── TAARExperimentV2ETL.ipynb ├── TAARExperimentV2Retention-alternate.ipynb ├── TAARExperimentV2Retention.ipynb ├── TAARLogMunge.ipynb ├── TAARV2.Rmd └── TAARV2.html ├── bin ├── build ├── deploy ├── pipstrap.py ├── run ├── run_package_test.py ├── taar-redis.py ├── test └── test_env.sh ├── docker-compose.yml ├── docs ├── TAARLITE-README.md ├── randomized_tails.md ├── release_instructions.md └── taarlite-screenshot.png ├── environment.yml ├── prod-requirements.txt ├── setup.cfg ├── setup.py ├── taar ├── __init__.py ├── adapters │ ├── __init__.py │ └── tests │ │ └── __init__.py ├── context.py ├── flask_app.py ├── interfaces.py ├── logs │ ├── __init__.py │ ├── moz_logging.py │ └── stubs.py ├── plugin.py ├── profile_fetcher.py ├── recommenders │ ├── __init__.py │ ├── base_recommender.py │ ├── cache.py │ ├── collaborative_recommender.py │ ├── debug.py │ ├── ensemble_recommender.py │ ├── guid_based_recommender.py │ ├── locale_recommender.py │ ├── randomizer.py │ ├── recommendation_manager.py │ ├── redis_cache.py │ ├── similarity_recommender.py │ └── ua_parser.py ├── settings.py ├── tests │ └── __init__.py └── utils.py └── tests ├── __init__.py ├── conftest.py ├── mocks.py ├── noop_fixtures.py ├── similarity_data.py ├── test_collaborativerecommender.py ├── test_ensemblerecommender.py ├── test_guid_based_recommender.py ├── test_integration.py ├── test_localerecommender.py ├── test_profile_fetcher.py ├── test_randomizer.py ├── test_recommendation_manager.py └── test_similarityrecommender.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # These environment variables must be set in CircleCI UI 2 | # 3 | # DOCKERHUB_REPO - docker hub repo, format: / 4 | # DOCKER_USER 5 | # DOCKER_PASS 6 | # 7 | 8 | version: 2 9 | jobs: 10 | build_deploy: 11 | machine: 12 | enable: true 13 | steps: 14 | - checkout 15 | - run: ./bin/build 16 | - run: ./bin/deploy 17 | integration_test: 18 | docker: 19 | - image: continuumio/miniconda3 20 | steps: 21 | - checkout 22 | - setup_remote_docker: 23 | docker_layer_caching: true 24 | - run: apt-get update; apt-get install make -y 25 | - run: . /opt/conda/etc/profile.d/conda.sh && conda env create -n taar-37 --file environment.yml 26 | - run: . 
/opt/conda/etc/profile.d/conda.sh && conda activate taar-37 && python setup.py install && make pytest 27 | 28 | workflows: 29 | version: 2 30 | test_build_deploy: 31 | jobs: 32 | - integration_test: 33 | filters: 34 | tags: 35 | only: /.*/ 36 | - build_deploy: 37 | requires: 38 | - integration_test 39 | filters: 40 | tags: 41 | only: /.*/ 42 | branches: 43 | only: master 44 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503,W504,E203,E231 3 | exclude = .git,__pycache__ 4 | max-complexity = 10 5 | max-line-length = 120 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # intermediate csvs for analysis 2 | *.csv 3 | TAARV2_cache/ 4 | TAARV2_files/ 5 | 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | .hypothesis/ 53 | fixtures 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # MAC os garbage 100 | .DS_Store 101 | 102 | # Pycharm garbage (j/k jetbrains you know i love you) 103 | .idea 104 | 105 | # vim garbage 106 | *.sw? 107 | 108 | # vscode garbage 109 | .vscode 110 | 111 | # unix history files 112 | .bash_history 113 | .python_history 114 | 115 | # output from bin/build 116 | version.json 117 | 118 | # local virtual_env 119 | virtual_env 120 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | - repo: https://gitlab.com/pycqa/flake8 8 | rev: 3.7.9 9 | hooks: 10 | - id: flake8 11 | -------------------------------------------------------------------------------- /API.md: -------------------------------------------------------------------------------- 1 | API documentation 2 | 3 | # Get addon recommendations 4 | 5 | Returns a list of addon recommendations for the given hashed client id.
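A hypothetical request sketch (the host name and the hashed client id are placeholder values; the exact URL pattern and data constraints are specified below):

```python
# Hypothetical example request against a TAAR deployment.
# The host and the hashed client id are placeholders, not real values.
import requests

url = "https://taar.example.com/v1/api/recommendations/<hashed_client_id>/"
payload = {"options": {"promoted": [["guid1", 10], ["guid2", 5]]}}

response = requests.post(url, json=payload)
print(response.json())  # e.g. {"results": ["taar-guid1", ...], "result_info": []}
```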
6 | 7 | **URL** : `/v1/api/recommendations/<hashed_client_id>/` 8 | 9 | **Method** : `POST` 10 | 11 | **Auth required** : NO 12 | 13 | **Permissions required** : None 14 | 15 | **Data constraints** 16 | 17 | ```json 18 | { 19 | "options": {"promoted": [ 20 | ["[1 to 30 chars]", Some Number], 21 | ["[1 to 30 chars]", Some Number] 22 | ] 23 | } 24 | } 25 | ``` 26 | 27 | Note that the only valid key for the top level JSON is `options`. 28 | 29 | `options` is always a dictionary of optional values. 30 | 31 | To denote no optional data, it is perfectly valid for the JSON data 32 | to have no `options` key, or even simpler, to have no POST data at all. 33 | 34 | Each item in the promoted addon GUID list is accompanied by an 35 | integer weight. Every promoted addon, whatever its weight, is ranked 36 | ahead of any TAAR-recommended addon GUID. 37 | 38 | **Data examples** 39 | 40 | Partial data is allowed. 41 | 42 | ```json 43 | { 44 | "options": {"promoted": [ 45 | ["guid1", 10], 46 | ["guid2", 5] 47 | ] 48 | } 49 | } 50 | ``` 51 | 52 | 53 | ## Success Responses 54 | 55 | **Condition** : Data provided is valid 56 | 57 | **Code** : `200 OK` 58 | 59 | **Content example** : Response will reflect a list of addon GUID suggestions. 60 | 61 | ```json 62 | { 63 | "results": ["taar-guid1", "taar-guid2", "taar-guid3"], 64 | "result_info": [] 65 | } 66 | ``` 67 | 68 | ## Error Response 69 | 70 | **Condition** : If provided data is invalid, e.g. the options object is not a dictionary. 71 | 72 | **Code** : `400 BAD REQUEST` 73 | 74 | **Content example** : 75 | 76 | ```json 77 | { 78 | "invalid_option": [ 79 | "Please provide a dictionary with a `promoted` key mapped to a list of promoted addon GUIDs" 80 | ] 81 | } 82 | ``` 83 | 84 | ## Notes 85 | 86 | * Endpoint will ignore irrelevant data, such as parameters or fields that 87 | don't exist. 88 | * Endpoint will try to fail gracefully and return an empty list in the 89 | results key if no suggestions can be made. 90 | * The only condition under which the endpoint should return an error code is 91 | when the options data is malformed. 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /DATA_COLLECTION_POLICY.md: -------------------------------------------------------------------------------- 1 | # TAAR: Data Collection Policy 2 | Data collection policy and opt-out mechanisms pertaining to the Telemetry-Aware Addon Recommender (TAAR) service.
3 | 4 | Table of Contents (ToC): 5 | =========================== 6 | 7 | * [Overview](#overview) 8 | * [Technical details](#technical-details) 9 | * [Client-side](#client-side-data-collection) 10 | * [Server-side](#data-collected-from-the-taar-server) 11 | * [Opt-out mechanisms](#opt-out-mechanisms-for-taar) 12 | * [Privacy considerations](#privacy-considerations) 13 | 14 | ## Overview 15 | To better predict what extensions you may find interesting, Firefox uses the Telemetry-Aware Add-on Recommender (TAAR) system—a Mozilla service that recommends extensions by examining basic browser telemetry. This means TAAR analyzes usage statistics from a large number of other Firefox users, looks at other extensions you may have installed, and considers general characteristics about your Firefox profile (like language preference). Based on this information, TAAR surfaces extension recommendations tailored just for you. 16 | 17 | Extensions allow you to add features to Firefox to customize your browsing experience. Extensions are software programs, most often developed by a third party, that modify the way Firefox works. 18 | 19 | ## Technical details 20 | In order to associate a client's existing telemetry data to a [TAAR-API](https://github.com/mozilla/taar-api) request during a browser session, a one-way [sha256 hash](https://en.wikipedia.org/wiki/SHA-2) of the [telemetry client_id](https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/common-ping.html) is exposed via a specific [TAAR preference](https://bugzilla.mozilla.org/show_bug.cgi?id=1499470). This preference can be enabled/disabled via the `about:preferences` page in Firefox. The hashed client_id allows us to use a subset of Firefox telemetry data sources to build models that can be used to make Web Extension recommendations based (in part) on telemetry. 21 | 22 | ### Client-side data collection 23 | The following data collection is implemented on the _client_: 24 | 25 | * one-way hashed telemetry client_id: sha256(client_id) 26 | 27 | ### Data collected from the TAAR server 28 | Note: this data is logged once a successful lookup has been performed associating a hashed client_id with a previously seen hashed client_id. The complete list of fields below is only collected in the case that a successful response (including a set of recommendations) has been made. This data is not from the client, nor is it accessible by the client. It's included here for completeness. 29 | 30 | * timestamp (a timestamp object: system timestamp for the taar request) 31 | * taar.model (a string value: indicating the current version of the taar service and the model used) 32 | * logger.identifier (a string value: logger name, should always be "srg.taar") 33 | * hashed_client_id (a string value: the same hashed clientId as is exposed to AMO) 34 | * model_parameters (list of floats: necessary parameters for operating and diagnosing TAAR operation) 35 | * guids_recommended (a list of strings: the list of guids that were served as recommendations for that taar request) 36 | 37 | ## Opt-out mechanisms for TAAR 38 | To turn off personalized recommendations in the Add-ons Manager, visit [hamburger menu] > Preferences > Data Collection and Use, and un-check the box that reads, “Allow Firefox to make personalized extension recommendations.” 39 | 40 | If you opt out, you’ll still see generalized recommended extensions in the Add-ons Manager; however, they won’t be personally tailored for you using telemetry data.
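As a minimal sketch of the one-way hashing step described in the Technical details section above (the client_id shown is a fabricated example value):

```python
# Minimal sketch: one-way sha256 hashing of a telemetry client_id.
# The client_id value here is fabricated for illustration.
import hashlib

client_id = "00000000-0000-0000-0000-000000000000"
hashed_client_id = hashlib.sha256(client_id.encode("utf-8")).hexdigest()
print(hashed_client_id)
```

Because the hash is one-way, the original client_id cannot be recovered from the exposed value.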
41 | 42 | The TAAR service will not log any information while a client session is in [Private Browsing Mode](https://support.mozilla.org/en-US/kb/private-browsing-use-firefox-without-history). 43 | 44 | ## Privacy considerations 45 | All data collected by the TAAR service adheres to Mozilla's [Data Collection guidelines](https://wiki.mozilla.org/Firefox/Data_Collection). No Personally Identifiable Information (PII) is collected by the TAAR service, and raw data collected by the TAAR service is never joined against other derived datasets. 46 | 47 | A data retention period of 180 days for the raw data applies to all client- and server-side data collected by the TAAR service. Anonymised aggregates based on this data may be stored indefinitely. Aggregates derived from the collected data will be used for monitoring the health of the Web Extensions ecosystem. System logs will be used to diagnose problems with the TAAR core technology. 48 | 49 | Data collection and retention decisions related to the development of the TAAR services are driven by the [Mozilla Privacy Principles](https://www.mozilla.org/en-US/privacy/principles/) and strive to provide transparency. 50 | 51 | Recommendations are strictly intended to provide a better browsing experience for Firefox users. As described above, extension developers cannot pay for placement in the recommendation program, and Firefox does not receive any compensation as a result of this process. 52 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | ENV PYTHONDONTWRITEBYTECODE 1 3 | 4 | MAINTAINER Victor Ng 5 | # add a non-privileged user for installing and running 6 | # the application 7 | RUN groupadd --gid 10001 app && \ 8 | useradd --uid 10001 --gid 10001 --home /app --create-home app 9 | 10 | RUN apt-get update && \ 11 | apt-get install -y --no-install-recommends build-essential gettext curl \ 12 | libopenblas-dev libatlas3-base gfortran && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | WORKDIR /app 16 | 17 | # First copy requirements so we can take advantage of docker 18 | # caching. 19 | COPY ./environment.yml /app/environment.yml 20 | RUN conda env update -n taar-37 -f environment.yml 21 | 22 | COPY . /app 23 | RUN python setup.py develop 24 | 25 | RUN . /opt/conda/etc/profile.d/conda.sh && \ 26 | conda activate taar-37 && python setup.py install 27 | 28 | 29 | USER app 30 | 31 | # Using /bin/bash as the entrypoint works around some volume mount issues on Windows 32 | # where volume-mounted files do not have execute bits set. 33 | # https://github.com/docker/compose/issues/2301#issuecomment-154450785 has additional background.
34 | ENTRYPOINT ["/bin/bash", "/app/bin/run"] 35 | 36 | # bin/run supports web|web-dev|test options 37 | CMD ["web"] 38 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | recursive-include taar *.py *.html *.js *.css 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build up tests flake8 ci tests-with-cov 2 | 3 | all: 4 | # PySpark only knows eggs, not wheels 5 | python setup.py sdist 6 | 7 | setup_conda: 8 | # Install all dependencies and setup repo in dev mode 9 | conda env update -n taar-37 -f environment.yml 10 | python setup.py develop 11 | 12 | conda_update: 13 | # Actualize env after .yml file was modified 14 | conda env update -n taar-37 -f environment.yml --prune 15 | 16 | conda_export: 17 | conda env export > environment.yml 18 | 19 | upload: 20 | twine upload --repository-url https://upload.pypi.org/legacy/ dist/* 21 | 22 | pytest: 23 | python setup.py develop 24 | python setup.py test 25 | flake8 taar tests 26 | 27 | build: 28 | docker build . -t taar:latest 29 | 30 | up: 31 | docker-compose up 32 | 33 | test-container: 34 | docker run -e CODECOV_TOKEN=${CODECOV_TOKEN} -it taar:latest test 35 | 36 | run_local: 37 | . bin/test_env.sh && python taar/flask_app.py -H 0.0.0.0 -P 8001 38 | 39 | run_package_test: 40 | python setup.py develop 41 | python bin/run_package_test.py 42 | 43 | shell: 44 | docker run -it taar:latest bash 45 | -------------------------------------------------------------------------------- /analysis/TAARExperimentV2ETL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from moztelemetry import Dataset\n", 10 | "from pyspark.sql import Row\n", 11 | "from pyspark.sql.types import BooleanType, LongType\n", 12 | "import pandas as pd\n", 13 | "import pyspark.sql.functions as F\n", 14 | "import datetime as dt\n", 15 | "\n", 16 | "sc.setLogLevel(\"INFO\")" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Define util funcs" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "def utc2date(seconds):\n", 33 | " \"\"\"\n", 34 | " Takes unix time in seconds and returns a string representation\n", 35 | " \"\"\"\n", 36 | " utc = dt.datetime(1970, 1, 1)\n", 37 | " try:\n", 38 | " return dt.datetime.strftime(utc + dt.timedelta(seconds=seconds), format='%Y%m%d')\n", 39 | " except:\n", 40 | " return None\n", 41 | "\n", 42 | " \n", 43 | "def shield_data(x):\n", 44 | " \"\"\"\n", 45 | " Grabs the data reported by the shield add-on \n", 46 | " \"\"\"\n", 47 | " return x.get(\"payload\", {}).get(\"data\", {}).get(\"attributes\", {})\n", 48 | "\n", 49 | "\n", 50 | "def _cast(col, f):\n", 51 | " if col != 'null':\n", 52 | " try:\n", 53 | " return f(col)\n", 54 | " except:\n", 55 | " pass\n", 56 | " return\n", 57 | "\n", 58 | "_bool = lambda x: True if x == 'true' else False\n", 59 | "\n", 60 | "castLong = F.udf(lambda x: _cast(x, long), LongType())\n", 61 | "castBool = F.udf(lambda x: _cast(x, _bool), BooleanType())\n", 62 | " \n", 63 | "\n", 64 | "def collapse_fields(x):\n", 
65 | " \"\"\"\n", 66 | " Collapsed nested field names \n", 67 | " and returns a flattened object as a \n", 68 | " PySpark Row to prepare for DataFrame \n", 69 | " conversion\n", 70 | " \"\"\"\n", 71 | " if x is None:\n", 72 | " x = {}\n", 73 | " data = x.get(\"payload\", {}).get(\"data\").get(\"attributes\", {})\n", 74 | " addons= x.get(\"environment\", {}).get(\"addons\", {}).get(\"activeAddons\", {})\n", 75 | " result = Row(\n", 76 | " client_id=x.get(\"clientId\"),\n", 77 | " locale=x.get(\"environment\", {}).get(\"settings\", {}).get(\"locale\"),\n", 78 | " branch=x.get(\"payload\", {}).get(\"branch\"),\n", 79 | " addon_id=data.get(\"addon_id\"),\n", 80 | " clicked_button=data.get(\"clickedButton\"),\n", 81 | " creation_date=x.get(\"creationDate\"),\n", 82 | " ping_type=data.get(\"pingType\"),\n", 83 | " saw_popup=data.get(\"sawPopup\"),\n", 84 | " src=data.get(\"srcURI\"),\n", 85 | " start_time_utc=data.get(\"startTime\"),\n", 86 | " dwell_time=data.get(\"aboutAddonsActiveTabSeconds\"),\n", 87 | " discopane_loaded=data.get(\"discoPaneLoaded\"),\n", 88 | " submission_date_s3=x.get(\"meta\").get(\"submissionDate\"),\n", 89 | " current_addons=[i for i in addons if \\\n", 90 | " not addons[i].get('isSystem', True) and \\\n", 91 | " not addons[i].get('foreignInstall', True)]\n", 92 | " )\n", 93 | " return result" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Define study dates in string and unix format" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "study start date: 20180312\n", 113 | "study end date: 20180423\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "START_DATE_STR = \"20180312\"\n", 119 | "END_DATE_STR = \"20180423\"\n", 120 | "print(\"study start date: \" + START_DATE_STR + \"\\n\" + \"study end date: \" + END_DATE_STR)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Load raw pings from experiment" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "code_folding": [] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "# load all taar pings from our adjusted start date of 20171008\n", 139 | "taarv2_pings = (\n", 140 | " Dataset.from_source(\"telemetry\")\n", 141 | " .where(docType=\"shield-study-addon\")\n", 142 | " .where(submissionDate=lambda x: x >= START_DATE_STR and x <= END_DATE_STR)\n", 143 | " .records(sc)\n", 144 | " .filter(lambda x: x.get(\"payload\", {}).get(\"study_name\") == \"TAARExperimentV2\")\n", 145 | " .filter(lambda x: x.get(\"payload\", {}).get(\"addon_version\") == \"1.0.13\")\n", 146 | " .filter(lambda x: x.get(\"payload\", {}).get(\"testing\") == False)\n", 147 | ")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Convert pings to a structured spark DataFrame" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "scrolled": false 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "# sampleRatio infers schema from first 0.1% of rows\n", 166 | "taarv2_DF = taarnet-mozaws-prod-us-west-2-pipeline-analysisv2_pings.map(collapse_fields).toDF(sampleRatio=0.001)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Cast non-string columns to the appropriate type" 174 | ] 175 | 
}, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 10, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "bool_cols = [\n", 183 | " 'discopane_loaded',\n", 184 | " 'clicked_button',\n", 185 | " 'saw_popup', \n", 186 | "]\n", 187 | "\n", 188 | "long_cols = [\n", 189 | " 'start_time_utc',\n", 190 | " 'dwell_time',\n", 191 | "]\n", 192 | "\n", 193 | "for b in bool_cols:\n", 194 | " taarv2_DF = taarv2_DF.withColumn(b, castBool(b))\n", 195 | " \n", 196 | "for l in long_cols:\n", 197 | " taarv2_DF = taarv2_DF.withColumn(l, castLong(l))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 11, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "root\n", 210 | " |-- addon_id: string (nullable = true)\n", 211 | " |-- branch: string (nullable = true)\n", 212 | " |-- clicked_button: boolean (nullable = true)\n", 213 | " |-- client_id: string (nullable = true)\n", 214 | " |-- creation_date: string (nullable = true)\n", 215 | " |-- current_addons: array (nullable = true)\n", 216 | " | |-- element: string (containsNull = true)\n", 217 | " |-- discopane_loaded: boolean (nullable = true)\n", 218 | " |-- dwell_time: long (nullable = true)\n", 219 | " |-- locale: string (nullable = true)\n", 220 | " |-- ping_type: string (nullable = true)\n", 221 | " |-- saw_popup: boolean (nullable = true)\n", 222 | " |-- src: string (nullable = true)\n", 223 | " |-- start_time_utc: long (nullable = true)\n", 224 | " |-- submission_date_s3: string (nullable = true)\n", 225 | "\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "taarv2_DF.printSchema()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "Write to S3, partitioning by `branch`, since most subsequent queries will involve aggregating by this field" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 12, 243 | "metadata": { 244 | "code_folding": [], 245 | "scrolled": false 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "S3_PATH = 's3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/'\n", 250 | "\n", 251 | "(\n", 252 | "taarv2_DF\n", 253 | " .repartition(1)\n", 254 | " .write\n", 255 | " .partitionBy('branch')\n", 256 | " .mode(\"overwrite\")\n", 257 | " .parquet(S3_PATH)\n", 258 | ")" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 13, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "n records: 8762664\n", 271 | "n clients: 3491762\n", 272 | "[Row(min(submission_date_s3)=u'20180312', max(submission_date_s3)=u'20180417')]\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "# verify\n", 278 | "t = sqlContext.read.parquet(S3_PATH)\n", 279 | "\n", 280 | "print \"n records:\", t.count()\n", 281 | "print \"n clients:\", t.select('client_id').distinct().count()\n", 282 | "sd = t.select(F.min(\"submission_date_s3\"), \n", 283 | " F.max('submission_date_s3'))\n", 284 | "print sd.collect()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [] 293 | } 294 | ], 295 | "metadata": { 296 | "anaconda-cloud": {}, 297 | "kernelspec": { 298 | "display_name": "Python [conda root]", 299 | "language": "python", 300 | "name": "conda-root-py" 301 | }, 302 | "language_info": { 303 | "codemirror_mode": { 304 | "name": "ipython", 305 | "version": 2 306 | }, 307 | "file_extension": 
".py", 308 | "mimetype": "text/x-python", 309 | "name": "python", 310 | "nbconvert_exporter": "python", 311 | "pygments_lexer": "ipython2", 312 | "version": "2.7.12" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 1 317 | } 318 | -------------------------------------------------------------------------------- /analysis/TAARLogMunge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pyspark.sql.functions as F\n", 12 | "import datetime as dt\n", 13 | "import ast\n", 14 | "import boto3\n", 15 | "import json\n", 16 | "import re" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "DATA_LOCATION = \"s3://net-mozaws-prod-us-west-2-pipeline-analysis/taar-api-logs-daily/\"" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# Parse the TAAR application logs from s3 source.\n", 39 | "taar_logs = sqlContext\\\n", 40 | " .read.format(\"com.databricks.spark.csv\")\\\n", 41 | " .option(\"header\", \"true\")\\\n", 42 | " .option(\"inferschema\", \"true\")\\\n", 43 | " .option(\"mode\", \"DROPMALFORMED\")\\\n", 44 | " .load(DATA_LOCATION)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "StructType(List(StructField(timestamp,StringType,true),StructField(severity,IntegerType,true),StructField(type,StringType,true),StructField(fields,StringType,true),StructField(date,StringType,true)))\n", 57 | "\n", 58 | "[Row(timestamp=u'2018-03-30 00:00:23.000', severity=6, type=u'taar.recommenders.ensemble_recommender', fields=u\"{message=client_id: [00000000-0000-0000-0000-000000000000], ensemble_weight: [{'similarity': 0.09216174, 'collaborative': 2.16759527, 'legacy': 0.05516607, 'locale': 2.09866473}], guids: [['uBlock0@raymondhill.net', '{73a6fe31-595d-460b-a920-fcc0f8843232}', 'firefox@ghostery.com', 'firefoxdav@icloud.com', 'ich@maltegoetz.de', 'idsafe@norton.com', 'nortonsafeweb@symantec.com', '{d04b0b40-3dab-4f0b-97a6-04ec3eddbfb0}', 'artur.dubovoy@gmail.com', '{a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7}']], recommender=null, client_id=null, lang=null, limit=null, num_recommendations=null, maximum_similarity=null}\", date=u'2018-03-29')]\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "# Display log file schema.\n", 64 | "print(taar_logs.schema)\n", 65 | "# Display one exampel row of log data.\n", 66 | "print(\"\\n\" + str(taar_logs.take(1)))" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "StructType(List(StructField(severity,IntegerType,true),StructField(type,StringType,true),StructField(fields,StringType,true),StructField(date,StringType,true),StructField(parsed_time,TimestampType,true)))\n", 79 | "\n", 80 | "\n", 81 | "[Row(severity=6, type=u'taar.recommenders.ensemble_recommender', fields=u\"{message=client_id: [00000000-0000-0000-0000-000000000000], ensemble_weight: [{'similarity': 0.09216174, 'collaborative': 2.16759527, 'legacy': 0.05516607, 'locale': 2.09866473}], guids: 
[['uBlock0@raymondhill.net', '{73a6fe31-595d-460b-a920-fcc0f8843232}', 'firefox@ghostery.com', 'firefoxdav@icloud.com', 'ich@maltegoetz.de', 'idsafe@norton.com', 'nortonsafeweb@symantec.com', '{d04b0b40-3dab-4f0b-97a6-04ec3eddbfb0}', 'artur.dubovoy@gmail.com', '{a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7}']], recommender=null, client_id=null, lang=null, limit=null, num_recommendations=null, maximum_similarity=null}\", date=u'2018-03-29', parsed_time=datetime.datetime(2018, 3, 30, 0, 0, 23))]\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# Convert text timestamp to actual timestamp object.\n", 87 | "time_format = \"yyyy-MM-dd HH:mm:ss.SSS\"\n", 88 | "taar_logs_timestamps = taar_logs.withColumn(\"parsed_time\", F.to_timestamp(\"timestamp\", time_format)\n", 89 | " .cast(\"double\")\n", 90 | " .cast(\"timestamp\")).drop(\"timestamp\")\n", 91 | "\n", 92 | "print(taar_logs_timestamps.schema)\n", 93 | "print(\"\\n\")\n", 94 | "print(taar_logs_timestamps.take(1))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Define a utility for writing results of this analysis to an accessible s3 bucket.\n", 106 | "def write_to_s3(bucket_name, filename, data, aws_access_key_id=None, aws_secret_access_key=None):\n", 107 | " \"\"\" write list as CSV to s3\n", 108 | " params: bucket_name, str, name of bucket\n", 109 | " filename, str, name of file (prefix + file name)\n", 110 | " return: nothing\n", 111 | " \"\"\"\n", 112 | " s3 = boto3.Session(aws_access_key_id=aws_access_key_id,\n", 113 | " aws_secret_access_key=aws_secret_access_key).resource('s3')\n", 114 | " obj = s3.Object(bucket_name, filename)\n", 115 | " obj.put(Body=json.dumps(data, ensure_ascii=False).encode('utf8'))" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "def is_log_type_recommendation(r):\n", 127 | " return \"taar.recommenders.\" in r[\"type\"]\n", 128 | " \n", 129 | "def is_log_type_ensemble(r):\n", 130 | " return \"ensemble_recommender\" in r[\"type\"]\n", 131 | "\n", 132 | "def valid_uuid_as_field(r):\n", 133 | " reg_comp = re.compile(\"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\");\n", 134 | " return reg_comp.findall(r['fields'])\n", 135 | "\n", 136 | "def manual_dedup(p):\n", 137 | " zes = \"00000000-0000-0000-0000-000000000000\"\n", 138 | " a = set()\n", 139 | " for c in p:\n", 140 | " if len(c) == 1:\n", 141 | " if c != zes:\n", 142 | " a |= set(c)\n", 143 | " else:\n", 144 | " for g in c:\n", 145 | " if g != zes:\n", 146 | " a |= set(g)\n", 147 | " uuid_list = list(a)\n", 148 | " return uuid_list" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "lines of log data for TAAR service: 903766\n", 161 | "lines of log data after date filtering to study period: 807734\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "# Filter out log data from outside experiment time\n", 167 | "# 2018-03-12 begin date\n", 168 | "# 2018-04-18 end date\n", 169 | "print(\"lines of log data for TAAR service: \" + str(taar_logs_timestamps.count()))\n", 170 | "taar_logs_time_filtered = taar_logs_timestamps.where((taar_logs_timestamps.parsed_time > dt.datetime(2018, 3, 12, 0, 0, 0)) & (taar_logs_timestamps.parsed_time < 
dt.datetime(2018, 4, 23, 0, 0, 0)))\n", 171 | "print(\"lines of log data after date filtering to study period: \" + str(taar_logs_time_filtered.count()))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 10, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "number of failed client lookups: 24470\n", 184 | "post deduplication: 21859\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "# Find clients that had data retrieval failures\n", 190 | "def is_dynamo_interaction(p):\n", 191 | " return 'taar.adapters.dynamo' in p[\"type\"]\n", 192 | "\n", 193 | "def is_client_data_fail(p):\n", 194 | " return \"message=Error loading client data for\" in p[\"fields\"]\n", 195 | " \n", 196 | "clients_with_lookup_fail = taar_logs_time_filtered.rdd\\\n", 197 | " .filter(lambda p: is_dynamo_interaction(p))\\\n", 198 | " .filter(lambda p: is_client_data_fail(p))\\\n", 199 | " .map(lambda p: valid_uuid_as_field(p))\n", 200 | "\n", 201 | "print(\"number of failed client lookups: \" + str(clients_with_lookup_fail.count()))\n", 202 | "\n", 203 | "unique_output_failed_lookup_clientIDs = clients_with_lookup_fail.toDF().distinct().collect()\n", 204 | "print(\"post deduplication: \" + str(len(unique_output_failed_lookup_clientIDs)))\n", 205 | "\n", 206 | "# write the blacklist\n", 207 | "write_to_s3(\"net-mozaws-prod-us-west-2-pipeline-analysis\", \"failed_dynamo_clients.csv\", unique_output_failed_lookup_clientIDs)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 11, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "number of linear taar service events: 471583\n", 220 | "unique clients served by linear taar: 175911\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "def is_linear_recomender(p):\n", 226 | " return 'taar.recommenders.recommendation_manager' in p[\"type\"]\n", 227 | "\n", 228 | "# Find clients successfully served by linear\n", 229 | "client_ids_linear_serves = taar_logs_time_filtered.rdd\\\n", 230 | " .filter(lambda p: not is_dynamo_interaction(p))\\\n", 231 | " .filter(lambda p: not is_client_data_fail(p))\\\n", 232 | " .filter(lambda p: is_linear_recomender(p))\\\n", 233 | " .map(lambda p: valid_uuid_as_field(p))\n", 234 | " \n", 235 | "print(\"number of linear taar service events: \" + str(client_ids_linear_serves.count()))\n", 236 | "unique_client_ids_linear_serves = client_ids_linear_serves.collect()\n", 237 | "\n", 238 | "unique_client_ids_linear_serves = manual_dedup(unique_client_ids_linear_serves)\n", 239 | "print(\"unique clients served by linear taar: \" + str(len(unique_client_ids_linear_serves)))\n", 240 | "\n", 241 | "write_to_s3(\"net-mozaws-prod-us-west-2-pipeline-analysis\", \"clients_served_linear.csv\", unique_client_ids_linear_serves)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 12, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "number of ensemble taar service events: 287211\n", 254 | "unique clients served by ensemble taar: 175321\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "def is_ensemble_recommender(p):\n", 260 | " return 'recommenders.ensemble_recommender' in p[\"type\"]\n", 261 | "\n", 262 | "def valid_ensemble_uuid(p):\n", 263 | " reg_comp = re.compile(\"message=client_id: \\\\[\")\n", 264 | " txt = reg_comp.split(p['fields'])\n", 265 | " return 
txt[1][0:36]\n", 266 | " \n", 267 | "# find clients successfully served by ensemble\n", 268 | "client_ids_ensemble_serves = taar_logs_time_filtered.rdd\\\n", 269 | " .filter(lambda p: not is_dynamo_interaction(p))\\\n", 270 | " .filter(lambda p: not is_client_data_fail(p))\\\n", 271 | " .filter(lambda p: is_ensemble_recommender(p))\\\n", 272 | " .map(lambda p: valid_ensemble_uuid(p))\n", 273 | " \n", 274 | "print(\"number of ensemble taar service events: \" + str(client_ids_ensemble_serves.count()))\n", 275 | "\n", 276 | "unique_client_ids_ensemble_serves = list(set(client_ids_ensemble_serves.collect()))\n", 277 | "print(\"unique clients served by ensemble taar: \" + str(len(unique_client_ids_ensemble_serves)))\n", 278 | "\n", 279 | "write_to_s3(\"net-mozaws-prod-us-west-2-pipeline-analysis\", \"clients_served_ensemble.csv\", unique_client_ids_ensemble_serves)" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "anaconda-cloud": {}, 285 | "kernelspec": { 286 | "display_name": "Python 2", 287 | "language": "python", 288 | "name": "python2" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 2 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython2", 300 | "version": "2.7.13" 301 | }, 302 | "name": "taar_log_munge", 303 | "notebookId": 10421 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 1 307 | } 308 | -------------------------------------------------------------------------------- /bin/build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eo pipefail 3 | 4 | # create a version.json 5 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' \ 6 | "$CIRCLE_SHA1" \ 7 | "$CIRCLE_TAG" \ 8 | "$CIRCLE_PROJECT_USERNAME" \ 9 | "$CIRCLE_PROJECT_REPONAME" \ 10 | "$CIRCLE_BUILD_URL" \ 11 | > version.json 12 | 13 | echo "Building the docker image with the tag app:build" 14 | docker build -t app:build . 15 | -------------------------------------------------------------------------------- /bin/deploy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eo pipefail 3 | 4 | # default variables 5 | : "${CIRCLE_TAG:=latest}" 6 | 7 | # Usage: retry MAX CMD... 8 | # Retry CMD up to MAX times. If it fails MAX times, returns failure. 
9 | # Example: retry 3 docker push "mozilla/telemetry-analysis-service:$TAG" 10 | function retry() { 11 | max=$1 12 | shift 13 | count=1 14 | until "$@"; do 15 | count=$((count + 1)) 16 | if [[ $count -gt $max ]]; then 17 | return 1 18 | fi 19 | echo "$count / $max" 20 | done 21 | return 0 22 | } 23 | 24 | echo "Logging into Docker hub" 25 | retry 3 docker login -u="$DOCKER_USER" -p="$DOCKER_PASS" 26 | 27 | echo "Tagging app:build with $CIRCLE_TAG" 28 | docker tag app:build "$DOCKERHUB_REPO:$CIRCLE_TAG" || 29 | (echo "Couldn't tag app:build as $DOCKERHUB_REPO:$CIRCLE_TAG" && false) 30 | 31 | echo "Pushing tag $CIRCLE_TAG to $DOCKERHUB_REPO" 32 | retry 3 docker push "$DOCKERHUB_REPO:$CIRCLE_TAG" || 33 | (echo "Couldn't push $DOCKERHUB_REPO:$CIRCLE_TAG" && false) 34 | 35 | echo "Pushed $DOCKERHUB_REPO:$TAG" 36 | -------------------------------------------------------------------------------- /bin/pipstrap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """A small script that can act as a trust root for installing pip 8 3 | 4 | Embed this in your project, and your VCS checkout is all you have to trust. In 5 | a post-peep era, this lets you claw your way to a hash-checking version of pip, 6 | with which you can install the rest of your dependencies safely. All it assumes 7 | is Python 2.7 or better and *some* version of pip already installed. If 8 | anything goes wrong, it will exit with a non-zero status code. 9 | 10 | """ 11 | # This is here so embedded copies are MIT-compliant: 12 | # Copyright (c) 2016 Erik Rose 13 | # 14 | # Permission is hereby granted, free of charge, to any person obtaining a copy 15 | # of this software and associated documentation files (the "Software"), to 16 | # deal in the Software without restriction, including without limitation the 17 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 18 | # sell copies of the Software, and to permit persons to whom the Software is 19 | # furnished to do so, subject to the following conditions: 20 | # 21 | # The above copyright notice and this permission notice shall be included in 22 | # all copies or substantial portions of the Software. 23 | from __future__ import print_function 24 | from hashlib import sha256 25 | from os.path import join 26 | from pipes import quote 27 | from shutil import rmtree 28 | from subprocess import check_output 29 | from sys import exit 30 | from tempfile import mkdtemp 31 | 32 | try: 33 | from urllib2 import build_opener, HTTPHandler, HTTPSHandler 34 | except ImportError: 35 | from urllib.request import build_opener, HTTPHandler, HTTPSHandler 36 | try: 37 | from urlparse import urlparse 38 | except ImportError: 39 | from urllib.parse import urlparse # 3.4 40 | 41 | 42 | PACKAGES = [ 43 | # Pip has no dependencies, as it vendors everything: 44 | ( 45 | "https://pypi.python.org/packages/source/p/pip/pip-8.0.2.tar.gz", 46 | "46f4bd0d8dfd51125a554568d646fe4200a3c2c6c36b9f2d06d2212148439521", 47 | ), 48 | # This version of setuptools has only optional dependencies: 49 | ( 50 | "https://pypi.python.org/packages/source/s/setuptools/" 51 | "setuptools-19.4.tar.gz", 52 | "214bf29933f47cf25e6faa569f710731728a07a19cae91ea64f826051f68a8cf", 53 | ), 54 | # We require Python 2.7 or later because we don't support wheel's 55 | # conditional dep on argparse. 
This version of wheel has no other 56 | # dependencies: 57 | ( 58 | "https://pypi.python.org/packages/source/w/wheel/wheel-0.26.0.tar.gz", 59 | "eaad353805c180a47545a256e6508835b65a8e830ba1093ed8162f19a50a530c", 60 | ), 61 | ] 62 | 63 | 64 | class HashError(Exception): 65 | def __str__(self): 66 | url, path, actual, expected = self.args 67 | return ( 68 | "{url} did not match the expected hash {expected}. Instead, " 69 | "it was {actual}. The file (left at {path}) may have been " 70 | "tampered with.".format(**locals()) 71 | ) 72 | 73 | 74 | def hashed_download(url, temp, digest): 75 | """Download ``url`` to ``temp``, make sure it has the SHA-256 ``digest``, 76 | and return its path.""" 77 | # Based on pip 1.4.1's URLOpener but with cert verification removed 78 | def opener(): 79 | opener = build_opener(HTTPSHandler()) 80 | # Strip out HTTPHandler to prevent MITM spoof: 81 | for handler in opener.handlers: 82 | if isinstance(handler, HTTPHandler): 83 | opener.handlers.remove(handler) 84 | return opener 85 | 86 | def read_chunks(response, chunk_size): 87 | while True: 88 | chunk = response.read(chunk_size) 89 | if not chunk: 90 | break 91 | yield chunk 92 | 93 | response = opener().open(url) 94 | path = join(temp, urlparse(url).path.split("/")[-1]) 95 | actual_hash = sha256() 96 | with open(path, "wb") as file: 97 | for chunk in read_chunks(response, 4096): 98 | file.write(chunk) 99 | actual_hash.update(chunk) 100 | 101 | actual_digest = actual_hash.hexdigest() 102 | if actual_digest != digest: 103 | raise HashError(url, path, actual_digest, digest) 104 | return path 105 | 106 | 107 | def main(): 108 | temp = mkdtemp(prefix="pipstrap-") 109 | try: 110 | downloads = [hashed_download(url, temp, digest) for url, digest in PACKAGES] 111 | check_output( 112 | "pip install --no-index --no-deps -U " 113 | + " ".join(quote(d) for d in downloads), 114 | shell=True, 115 | ) 116 | except HashError as exc: 117 | print(exc) 118 | except Exception: 119 | rmtree(temp) 120 | raise 121 | else: 122 | rmtree(temp) 123 | return 0 124 | return 1 125 | 126 | 127 | if __name__ == "__main__": 128 | exit(main()) 129 | -------------------------------------------------------------------------------- /bin/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eo pipefail 3 | 4 | # default variables 5 | : "${PORT:=8000}" 6 | : "${SLEEP:=1}" 7 | : "${THREADS:=8}" 8 | : "${TRIES:=60}" 9 | : "${WORKERS:=4}" 10 | 11 | usage() { 12 | echo "usage: bin/run web|web-dev|test|python|shell" 13 | exit 1 14 | } 15 | 16 | wait_for() { 17 | tries=0 18 | echo "Waiting for $1 to listen on $2..." 19 | while true; do 20 | [[ $tries -lt $TRIES ]] || return 21 | (echo > /dev/tcp/$1/$2) >/dev/null 2>&1 22 | result= 23 | [[ $? -eq 0 ]] && return 24 | sleep $SLEEP 25 | tries=$((tries + 1)) 26 | done 27 | } 28 | 29 | [ $# -lt 1 ] && usage 30 | 31 | case $1 in 32 | web) 33 | . /opt/conda/etc/profile.d/conda.sh && conda activate taar-37 && exec newrelic-admin run-program gunicorn taar.flask_app:app --bind 0.0.0.0:${PORT} --workers ${WORKERS} --threads ${THREADS} --access-logfile - 34 | ;; 35 | web-dev) 36 | exec python taar/flask_app.py --host=0.0.0.0 --port=${PORT} 37 | ;; 38 | test) 39 | . /opt/conda/etc/profile.d/conda.sh && \ 40 | conda activate taar-37 && \ 41 | rm -f coverage.xml &&\ 42 | coverage erase && \ 43 | pytest --cov && \ 44 | # submit coverage 45 | coverage xml 46 | bash <(curl -s https://codecov.io/bash) 47 | ;; 48 | python) 49 | shift 50 | . 
/opt/conda/etc/profile.d/conda.sh && conda activate taar-37 && exec python $@ 51 | ;; 52 | shell) 53 | shift 54 | . /opt/conda/etc/profile.d/conda.sh && conda activate taar-37 && exec bash $@ 55 | ;; 56 | *) 57 | exec "$@" 58 | ;; 59 | esac 60 | -------------------------------------------------------------------------------- /bin/run_package_test.py: -------------------------------------------------------------------------------- 1 | # Emulate package call from Ensemble Spark job 2 | 3 | 4 | COLLABORATIVE, SIMILARITY, LOCALE = "collaborative", "similarity", "locale" 5 | PREDICTOR_ORDER = [COLLABORATIVE, SIMILARITY, LOCALE] 6 | 7 | 8 | def load_recommenders(): 9 | from taar.recommenders import LocaleRecommender 10 | from taar.recommenders import SimilarityRecommender 11 | from taar.recommenders import CollaborativeRecommender 12 | from taar.context import package_context 13 | 14 | ctx = package_context() 15 | 16 | lr = LocaleRecommender(ctx) 17 | sr = SimilarityRecommender(ctx) 18 | cr = CollaborativeRecommender(ctx) 19 | return {LOCALE: lr, COLLABORATIVE: cr, SIMILARITY: sr} 20 | 21 | 22 | if __name__ == '__main__': 23 | for i in range(2): 24 | rec_map = load_recommenders() 25 | 26 | recommender_list = [ 27 | rec_map[COLLABORATIVE].recommend, # Collaborative 28 | rec_map[SIMILARITY].recommend, # Similarity 29 | rec_map[LOCALE].recommend, # Locale 30 | ] 31 | 32 | client_data = {"installed_addons": ["uBlock0@raymondhill.net"], 33 | "locale": "en-CA", 34 | "client_id": "test-client-001", 35 | "activeAddons": [], 36 | "geo_city": "brasilia-br", 37 | "subsession_length": 4911, 38 | "os": "mac", 39 | "bookmark_count": 7, 40 | "tab_open_count": 4, 41 | "total_uri": 222, 42 | "unique_tlds": 21 43 | } 44 | 45 | for key, rec in rec_map.items(): 46 | print(key) 47 | assert rec.can_recommend(client_data) 48 | assert len(rec.recommend(client_data, limit=4)) == 4 49 | -------------------------------------------------------------------------------- /bin/taar-redis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from taar.interfaces import ITAARCache 3 | from taar.context import app_context 4 | import click 5 | 6 | 7 | @click.command() 8 | @click.option("--reset", is_flag=True, help="Reset the redis cache to an empty state") 9 | @click.option("--load", is_flag=True, help="Load data into redis") 10 | @click.option("--info", is_flag=True, help="Display information about the cache state") 11 | def main(reset, load, info): 12 | """ 13 | Manage the TAAR+TAARLite redis cache. 14 | 15 | This expects that the following environment variables are set: 16 | 17 | REDIS_HOST 18 | REDIS_PORT 19 | """ 20 | if not (reset or load or info): 21 | print("No options were set!") 22 | return 23 | 24 | ctx = app_context() 25 | cache = ctx[ITAARCache] 26 | 27 | if reset: 28 | if cache.reset(): 29 | print("Successfully flushed db0 bookkeeping database.") 30 | else: 31 | print("Error while flushing db0 bookkeeping database.") 32 | if load: 33 | cache.safe_load_data() 34 | if info: 35 | cache.info() 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /bin/test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eo pipefail 3 | 4 | # default variables 5 | export DEVELOPMENT=1 6 | 7 | # pass CI env vars into docker containers for codecov submission 8 | [ !
-z ${CI+check} ] && \ 9 | echo "Getting Codecov environment variables" && \ 10 | export CI_ENV=`bash <(curl -s https://codecov.io/env)` 11 | 12 | # run docker compose with the given environment variables 13 | docker-compose run -e DEVELOPMENT $CI_ENV web tox -etestsnocov 14 | -------------------------------------------------------------------------------- /bin/test_env.sh: -------------------------------------------------------------------------------- 1 | # Setup shell test enviroment settings 2 | export TAAR_API_PLUGIN=taar.plugin 3 | export DISABLE_REDIS=True 4 | export TAAR_MAX_RESULTS=10 5 | export BIGTABLE_PROJECT_ID=moz-fx-data-taar-pr-prod-e0f7 6 | export BIGTABLE_INSTANCE_ID=taar-prod-202006 7 | export BIGTABLE_TABLE_ID=taar_profile 8 | export TAAR_ITEM_MATRIX_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 9 | export TAAR_ITEM_MATRIX_KEY=addon_recommender/item_matrix.json.bz2 10 | export TAAR_ADDON_MAPPING_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 11 | export TAAR_ADDON_MAPPING_KEY=addon_recommender/addon_mapping.json.bz2 12 | export TAAR_ENSEMBLE_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 13 | export TAAR_ENSEMBLE_KEY=taar/ensemble/ensemble_weight.json.bz2 14 | export TAAR_WHITELIST_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 15 | export TAAR_WHITELIST_KEY=addon_recommender/only_guids_top_200.json.bz2 16 | export TAAR_LOCALE_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 17 | export TAAR_LOCALE_KEY=taar/locale/top10_dict.json.bz2 18 | export TAAR_SIMILARITY_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 19 | export TAAR_SIMILARITY_DONOR_KEY=taar/similarity/donors.json.bz2 20 | export TAAR_SIMILARITY_LRCURVES_KEY=taar/similarity/lr_curves.json.bz2 21 | export TAARLITE_GUID_COINSTALL_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 22 | export TAARLITE_GUID_COINSTALL_KEY=taar/lite/guid_coinstallation.json.bz2 23 | export TAARLITE_GUID_RANKING_KEY=taar/lite/guid_install_ranking.json.bz2 -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | services: 3 | redis: 4 | image: "redis:alpine" 5 | ports: 6 | - "6379:6379" 7 | web: 8 | image: "taar:latest" 9 | depends_on: 10 | - redis 11 | volumes: 12 | - /Users/epavlov/.gcp_creds:/app/.gcp_creds 13 | environment: 14 | - WORKERS=1 15 | - THREADS=2 16 | - LOG_LEVEL=20 17 | - GOOGLE_APPLICATION_CREDENTIALS=/app/.gcp_creds/moz-fx-data-taar-pr-prod-e0f7-bf36ebdc13e9.json 18 | - REDIS_HOST=redis 19 | - TAAR_API_PLUGIN=taar.plugin 20 | - TAAR_MAX_RESULTS=10 21 | - TAARLITE_MAX_RESULTS=4 22 | - BIGTABLE_PROJECT_ID=moz-fx-data-taar-pr-prod-e0f7 23 | - BIGTABLE_INSTANCE_ID=taar-prod-202006 24 | - BIGTABLE_TABLE_ID=taar_profile 25 | ports: 26 | - "8000:8000" 27 | populate-redis: 28 | image: "taar:latest" 29 | command: 30 | - python 31 | - /opt/conda/bin/taar-redis.py 32 | - --load 33 | depends_on: 34 | - redis 35 | volumes: 36 | - /Users/epavlov/.gcp_creds:/app/.gcp_creds 37 | environment: 38 | - LOG_LEVEL=20 39 | - GOOGLE_APPLICATION_CREDENTIALS=/app/.gcp_creds/moz-fx-data-taar-pr-prod-e0f7-bf36ebdc13e9.json 40 | - REDIS_HOST=redis 41 | - TAAR_ITEM_MATRIX_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 42 | - TAAR_ITEM_MATRIX_KEY=addon_recommender/item_matrix.json.bz2 43 | - TAAR_ADDON_MAPPING_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 44 | - TAAR_ADDON_MAPPING_KEY=addon_recommender/addon_mapping.json.bz2 45 | - 
TAAR_ENSEMBLE_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 46 | - TAAR_ENSEMBLE_KEY=taar/ensemble/ensemble_weight.json.bz2 47 | - TAAR_WHITELIST_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 48 | - TAAR_WHITELIST_KEY=addon_recommender/only_guids_top_200.json.bz2 49 | - TAAR_LOCALE_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 50 | - TAAR_LOCALE_KEY=taar/locale/top10_dict.json.bz2 51 | - TAAR_SIMILARITY_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 52 | - TAAR_SIMILARITY_DONOR_KEY=taar/similarity/donors.json.bz2 53 | - TAAR_SIMILARITY_LRCURVES_KEY=taar/similarity/lr_curves.json.bz2 54 | - TAARLITE_GUID_COINSTALL_BUCKET=moz-fx-data-taar-pr-prod-e0f7-prod-models 55 | - TAARLITE_GUID_COINSTALL_KEY=taar/lite/guid_coinstallation.json.bz2 56 | - TAARLITE_GUID_RANKING_KEY=taar/lite/guid_install_ranking.json.bz2 57 | 58 | 59 | -------------------------------------------------------------------------------- /docs/TAARLITE-README.md: -------------------------------------------------------------------------------- 1 | # Taar-lite 2 | 3 | The TAAR-lite service has been merged into the main TAAR repository 4 | now. 5 | 6 | TAAR-lite exposes a GUID-GUID recommender that recommends addons based 7 | on the co-installation rate of each accept-listed addon with other 8 | accept-listed addons. 9 | 10 | 11 | #### ETL workflow AMO guid-guid TAAR-lite 12 | * [taar_amodump.py](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_amodump.py) 13 | * Scheduled to run daily 14 | * Collects all listed addons by calling the [AMO public API](https://addons.mozilla.org/api/v4/addons/search/) endpoint 15 | * Applies a filter returning only Firefox Web Browser Extensions 16 | * Writes __extended_addons_database.json__ 17 | * [taar_amowhitelist.py](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_amowhitelist.py) 18 | * Scheduled to run daily, dependent on successful completion of [taar_amodump.py](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_amodump.py) 19 | * Filters the addons contained in __extended_addons_database.json__ 20 | * removes legacy addons 21 | * removes Web Extensions with a rating < 3.0 22 | * removes Web Extensions uploaded less than 60 days ago 23 | * removes [Firefox Pioneer](https://addons.mozilla.org/en-GB/firefox/addon/firefox-pioneer/?src=search) 24 | * Writes __whitelist_addons_database.json__ 25 | * [taar_lite_guidguid.py](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_lite_guidguid.py) 26 | * Computes the coinstallation rate of each whitelisted addon with other whitelisted addons for a sample of Firefox clients 27 | * Removes rare combinations of coinstallations 28 | * Writes __guid_coinstallation.json__ 29 | 30 | ## Build and run tests 31 | 32 | The main TAAR build and test instructions are applicable, as this is 33 | now a unified codebase. 34 | -------------------------------------------------------------------------------- /docs/randomized_tails.md: -------------------------------------------------------------------------------- 1 | # Randomized tail selection of addons 2 | 3 | Randomized recommendations do not mean that recommendations are 4 | fully randomized. Weights for each recommendation are normalized 5 | so that the sum of weights equals 1.0. 6 | 7 | Using `numpy.random.choice`, we then select a non-uniform random 8 | sample from the list of suggestions without replacement. Weights are 9 | used to define a vector of probabilities, as sketched below.
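A minimal sketch of that selection step (the function and variable names are illustrative, not the production API):

```python
# Minimal sketch of weighted tail selection: normalize the weights into a
# probability vector, then draw a non-uniform sample without replacement.
import numpy as np

def weighted_sample(suggestions, weights, k):
    weights = np.asarray(weights, dtype=float)
    probabilities = weights / weights.sum()  # normalized so the weights sum to 1.0
    return list(np.random.choice(suggestions, size=k, replace=False, p=probabilities))

# e.g. pick 2 of 4 suggestions, biased toward the heavier weights
print(weighted_sample(["guid-a", "guid-b", "guid-c", "guid-d"], [8, 4, 2, 1], 2))
```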
10 | -------------------------------------------------------------------------------- /docs/release_instructions.md: -------------------------------------------------------------------------------- 1 | # Instructions for releasing updates 2 | 3 | ## Overview 4 | 5 | Releases for TAAR are split across ETL jobs for Airflow and the 6 | webservice that handles traffic coming from addons.mozilla.org. 7 | 8 | You may or may not need to upgrade all parts at once. 9 | 10 | ### ETL release instructions 11 | 12 | ETL releases are further subdivided into 4 categories: 13 | 14 | 1. Scala code that requires deployment by Java JAR file to a Dataproc environment 15 | 2. PySpark code that requires deployment by a single monolithic script in the 16 | Dataproc environment. These are stored in [telemetry-airflow/jobs] 17 | and are autodeployed to gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs 18 | 3. Python code that executes in a Google Kubernetes Engine (GKE) 19 | environment using a docker container image. 20 | 4. TAAR User profile information 21 | 22 | #### 1. Scala jobs for Dataproc 23 | * [com.mozilla.telemetry.ml.AddonRecommender](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/ml/AddonRecommender.scala) from telemetry-batch-view.jar 24 | 25 | #### 2. PySpark jobs for Dataproc 26 | 27 | * [telemetry-airflow/jobs/taar_locale.py](https://github.com/mozilla/telemetry-airflow/blob/master/jobs/taar_locale.py) 28 | * [telemetry-airflow/jobs/taar_similarity.py](https://github.com/mozilla/telemetry-airflow/blob/master/jobs/taar_similarity.py) 29 | * [telemetry-airflow/jobs/taar_lite_guidguid.py](https://github.com/mozilla/telemetry-airflow/blob/master/jobs/taar_lite_guidguid.py) 30 | 31 | #### 3. GKEPodOperator jobs 32 | 33 | * [taar_etl.taar_amodump](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_amodump.py) 34 | * [taar_etl.taar_amowhitelist](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_amowhitelist.py) 35 | * [taar_etl.taar_update_whitelist](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_update_whitelist.py) 36 | * [taar_etl.taar_lite_guid_ranking](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_lite_guid_ranking.py) 37 | 38 | 39 | #### 4. TAAR User profile information 40 | 41 | The TAAR User profile information is stored in Cloud BigTable. The 42 | job is run as a list of idempotent steps. All tasks are contained in 43 | a single file at: 44 | 45 | * [taar_etl.taar_profile_bigtable](https://github.com/mozilla/taar_gcp_etl/blob/master/taar_etl/taar_profile_bigtable.py) 46 | 47 | 48 | ## Jobs are scheduled in two separate DAGs in Airflow. 49 | 50 | * [taar_daily](https://github.com/mozilla/telemetry-airflow/blob/master/dags/taar_daily.py) 51 | * [taar_weekly](https://github.com/mozilla/telemetry-airflow/blob/master/dags/taar_weekly.py) 52 | 53 | 54 | ### Updating code for GKEPodOperator jobs 55 | 56 | GKEPodOperator jobs must have code packaged up as containers for 57 | execution in GKE. Code can be found in the taar_gcp_etl repository. 58 | Detailed build instructions can be found in the 59 | [README.md](https://github.com/mozilla/taar_gcp_etl/blob/master/README.md) 60 | in that repository. 61 | 62 | Generally, if you tag a revision in `taar_gcp_etl` - CircleCI will build the production 63 | container for you automatically. You will also need to update the 64 | container tag in the `taar_daily` or `taar_weekly` DAGs.
65 | 66 | ### Updating code for PySpark jobs 67 | 68 | PySpark jobs are maintained in the telemetry-airflow repository. You 69 | must take care to update the code in that repository and have it 70 | merged to master for the code to autodeploy into the production Airflow instance. 71 | 72 | 73 | Airflow execution will always copy jobs out of the 74 | [jobs](https://github.com/mozilla/telemetry-airflow/tree/master/jobs) 75 | directory into `gs://moz-fx-data-prod-airflow-dataproc-artifacts/` 76 | 77 | ### Updating code for the Scala ETL job 78 | 79 | The sole remaining Scala job is part of the telemetry-batch-view 80 | repository. Airflow will automatically use the latest code in the 81 | master branch of `telemetry-batch-view`. 82 | 83 | 84 | ## Deploying the TAAR webservice 85 | 86 | The TAAR webservice is set up as a single container with no dependent 87 | containers. If you are familiar with earlier versions of TAAR, you 88 | may be expecting Redis servers to also be required - this is no longer 89 | the case. Models are sufficiently small that they can be held in memory. 90 | 91 | Tagging a version in git will trigger CircleCI to build a container 92 | image for production. 93 | 94 | Autopush on tag is currently enabled for the staging environment. 95 | 96 | You must inform operations to push the tag to the production environment. 97 | 98 | 99 | ## Deploying the PyPI package required for the Ensemble Spark job 100 | 101 | Update the package version in setup.py 102 | 103 | `make all` 104 | 105 | `make upload` 106 | 107 | Update the package version in the `taar_weekly` Airflow DAG. 108 | 109 | 110 | 111 | ## A note about logging 112 | 113 | tl;dr - Do **NOT** use python's logging module for any logging in the TAAR 114 | repository. TAAR's recommendation code is used by the ETL jobs - some 115 | of which execute inside a PySpark environment, and the logging module is 116 | incompatible with PySpark. 117 | 118 | PySpark distributes executable objects across the spark worker nodes 119 | by pickling live objects. Unfortunately, Python uses non-serializable 120 | mutexes in the logging module, which was not fixed until Python 3.8. 121 | 122 | See https://bugs.python.org/issue30520 for details. 123 | 124 | You cannot upgrade TAAR to use Python 3.8 either, as the full 125 | numerical computation stack of PySpark, numpy, scipy and sklearn does not 126 | properly support Python 3.8. 127 | 128 | So again: just **don't use Python logging**.
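A minimal demonstration of the pickling failure that motivates this rule (the exact exception wording varies across Python versions):

```python
import pickle
import threading

# The kind of mutex that logging.Logger holds internally is not
# picklable in any Python version:
try:
    pickle.dumps(threading.RLock())
except TypeError as exc:
    print(exc)  # e.g. "cannot pickle '_thread.RLock' object"

# PySpark ships closures and objects to worker nodes via pickle, so an
# object graph that captures such a lock (as loggers historically did)
# fails to serialize. TAAR's workaround is the print-based
# EmergencyLogger in taar/logs/stubs.py, which holds no locks at all.
```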
129 | -------------------------------------------------------------------------------- /docs/taarlite-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/taar/f542a1ec1ea50812c81a9782922447adc0a5bfab/docs/taarlite-screenshot.png -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: taar-37 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - pip=20.1.1 7 | - python=3.7.6 8 | - python_abi=3.7 9 | - setuptools=46.4.0 10 | - numpy=1.18.4 11 | - scipy=1.4.1 12 | - pip: 13 | - arrow==0.12.1 14 | - asn1crypto==0.24.0 15 | - atomicwrites==1.1.5 16 | - attrs==18.1.0 17 | - aws==0.2.5 18 | - aws-sam-translator==1.23.0 19 | - aws-xray-sdk==0.95 20 | - backports-functools-lru-cache==1.5 21 | - backports-ssl-match-hostname==3.5.0.1 22 | - backports-tempfile==1.0 23 | - backports-weakref==1.0.post1 24 | - bcrypt==3.1.4 25 | - binaryornot==0.4.4 26 | - blinker==1.4 27 | - boto==2.49.0 28 | - boto3==1.13.13 29 | - botocore==1.16.13 30 | - cachetools==4.1.0 31 | - certifi==2018.10.15 32 | - cffi==1.11.5 33 | - cfn-lint==0.32.1 34 | - chardet==3.0.4 35 | - click==6.7 36 | - configparser==3.5.0 37 | - cookiecutter==1.6.0 38 | - cookies==2.2.1 39 | - coverage==4.5.1 40 | - coveralls==1.3.0 41 | - cryptography==2.3 42 | - decorator==4.3.0 43 | - docker==3.4.1 44 | - docker-pycreds==0.3.0 45 | - dockerflow==2018.4.0 46 | - docopt==0.6.2 47 | - docutils==0.14 48 | - ecdsa==0.15 49 | - entrypoints==0.3 50 | - enum34==1.1.6 51 | - fabric==2.1.3 52 | - fakeredis==1.4.3 53 | - flake8==3.7.7 54 | - flask==1.0.2 55 | - flask-api==1.0 56 | - funcsigs==1.0.2 57 | - future==0.16.0 58 | - google-api-core==1.17.0 59 | - google-auth==1.15.0 60 | - google-cloud-bigtable==1.2.1 61 | - google-cloud-core==1.3.0 62 | - google-cloud-storage==1.19.1 63 | - googleapis-common-protos==1.51.0 64 | - grpc-google-iam-v1==0.12.3 65 | - grpcio==1.29.0 66 | - gunicorn==19.9.0 67 | - idna==2.7 68 | - importlib-metadata==1.6.0 69 | - invoke==1.1.0 70 | - ipaddress==1.0.22 71 | - iso8601==0.1.12 72 | - itsdangerous==0.24 73 | - jedi==0.12.1 74 | - jinja2==2.10.1 75 | - jinja2-time==0.2.0 76 | - jmespath==0.9.3 77 | - jsondiff==1.1.2 78 | - jsonpatch==1.25 79 | - jsonpickle==0.9.6 80 | - jsonpointer==2.0 81 | - jsonschema==3.2.0 82 | - junit-xml==1.9 83 | - mccabe==0.6.1 84 | - markus[datadog]==2.2.0 85 | - mock==2.0.0 86 | - more-itertools==4.2.0 87 | - mozilla-jsoncache==0.1.7 88 | - networkx==2.4 89 | - newrelic==5.14.1.144 90 | - packaging==17.1 91 | - paramiko==2.4.2 92 | - parso==0.3.1 93 | - pathlib2==2.3.2 94 | - pbr==4.1.0 95 | - pexpect==4.6.0 96 | - pickleshare==0.7.4 97 | - pip-api==0.0.1 98 | - pkginfo==1.4.2 99 | - pluggy==0.13.1 100 | - ply==3.11 101 | - poyo==0.4.1 102 | - prettytable==0.7.2 103 | - prompt-toolkit==1.0.15 104 | - protobuf==3.12.0 105 | - py==1.5.3 106 | - pyaml==17.12.1 107 | - pyasn1==0.4.3 108 | - pyasn1-modules==0.2.8 109 | - pycodestyle==2.5.0 110 | - pycparser==2.18 111 | - pyflakes==2.1.1 112 | - pygments==2.2.0 113 | - pynacl==1.2.1 114 | - pyparsing==2.2.0 115 | - pyrsistent==0.16.0 116 | - pytest==5.4.2 117 | - pytest-cov==2.5.1 118 | - pytest-flask==1.0.0 119 | - python-dateutil==2.6.1 120 | - python-decouple==3.3 121 | - python-jose==3.1.0 122 | - pytz==2018.5 123 | - pyyaml==5.3.1 124 | - redis==3.5.3 125 | - requests==2.23.0 126 | - requests-toolbelt==0.8.0 127 
| - responses==0.9.0 128 | - rope==0.17.0 129 | - s3transfer==0.3.3 130 | - scandir==1.8 131 | - sentry-sdk==0.7.3 132 | - setuptools-scm==2.1.0 133 | - simplegeneric==0.8.1 134 | - spark==0.2.1 135 | - spark-parser==1.8.7 136 | - sshpubkeys==3.1.0 137 | - tox==3.0.0 138 | - tqdm==4.23.4 139 | - translationstring==1.3 140 | - twine==1.11.0 141 | - uncompyle2==2.0.0 142 | - uncompyle6==3.2.0 143 | - urllib3==1.25.9 144 | - virtualenv==16.0.0 145 | - wcwidth==0.1.7 146 | - websocket-client==0.48.0 147 | - werkzeug==1.0.1 148 | - whichcraft==0.4.1 149 | - wrapt==1.10.11 150 | - xdis==3.8.2 151 | - xmltodict==0.11.0 152 | - zipp==3.1.0 153 | -------------------------------------------------------------------------------- /prod-requirements.txt: -------------------------------------------------------------------------------- 1 | blinker==1.4 2 | certifi==2018.10.15 3 | newrelic==4.6.0.106 4 | gunicorn==19.9.0 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=100 3 | exclude= 4 | # ignore the migrations since they are created faulty by default 5 | # No need to traverse our git directory 6 | .git, 7 | # There's no value in checking cache directories 8 | __pycache__, 9 | 10 | # ignore spaces around keyword arguments and dict entries, 11 | # which are very useful for alignment 12 | ignore=E221,E251,E241,E501 13 | 14 | [aliases] 15 | test=pytest 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="mozilla-taar3", 5 | use_scm_version=False, 6 | version="1.0.7", 7 | setup_requires=["setuptools_scm", "pytest-runner"], 8 | tests_require=["pytest"], 9 | include_package_data=True, 10 | packages=find_packages(exclude=["tests", "tests/*"]), 11 | description="Telemetry-Aware Addon Recommender", 12 | author="Mozilla Foundation", 13 | author_email="fx-data-dev@mozilla.org", 14 | url="https://github.com/mozilla/taar", 15 | license="MPL 2.0", 16 | install_requires=[], 17 | classifiers=[ 18 | "Development Status :: 3 - Alpha", 19 | "Environment :: Web Environment :: Mozilla", 20 | "Intended Audience :: Developers", 21 | "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", 22 | "Programming Language :: Python", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.7", 25 | "Topic :: Internet :: WWW/HTTP", 26 | "Topic :: Scientific/Engineering :: Information Analysis", 27 | ], 28 | entry_points=""" 29 | [taarapi_app] 30 | app=taar.plugin:configure_plugin 31 | """, 32 | scripts=["bin/taar-redis.py"], 33 | zip_safe=False, 34 | ) 35 | -------------------------------------------------------------------------------- /taar/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | __version__ = pkg_resources.require("mozilla-taar3")[0].version 4 | -------------------------------------------------------------------------------- /taar/adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/taar/f542a1ec1ea50812c81a9782922447adc0a5bfab/taar/adapters/__init__.py -------------------------------------------------------------------------------- 
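As an aside, the `taarapi_app` entry point declared in `setup.py` above is resolved at runtime by the Flask shim in `taar/flask_app.py` (shown later in this listing). Stripped to its essentials, and using `os.environ` here as a stand-in for the `decouple` lookup the app actually performs, the resolution looks like this:

```python
# Simplified sketch of how the TAAR plugin module is resolved at startup.
import importlib
import os

# taar/flask_app.py reads this setting via decouple's config();
# os.environ is a stand-in for this sketch.
PLUGIN = os.environ.get("TAAR_API_PLUGIN", "taar.plugin")

configure_plugin = importlib.import_module(PLUGIN).configure_plugin
# configure_plugin(app) then attaches all API routes to the Flask app.
```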
/taar/adapters/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/taar/f542a1ec1ea50812c81a9782922447adc0a5bfab/taar/adapters/tests/__init__.py -------------------------------------------------------------------------------- /taar/context.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # copy paste from https://github.com/mozilla/srgutil to get rid of this heavy legacy dependency 6 | 7 | """ 8 | A Context is a customizable namespace. 9 | 10 | It works like a regular dictionary, but allows you to set a delegate 11 | explicitly to do attribute lookups. 12 | 13 | The primary benefit is that the context has a .child() method which 14 | lets you 'lock' a dictionary and clobber the namespace without 15 | affecting parent contexts. 16 | 17 | In practice this makes testing easier and allows us to specialize 18 | configuration information as we pass the context through an object 19 | chain. 20 | """ 21 | from taar.interfaces import IMozLogging, ITAARCache 22 | 23 | 24 | class InvalidInterface(Exception): 25 | """Raise this when impl() fails to export an implementation""" 26 | pass 27 | 28 | 29 | class Context: 30 | def __init__(self, delegate=None): 31 | if delegate is None: 32 | delegate = {} 33 | 34 | self._local_dict = {} 35 | self._delegate = delegate 36 | 37 | def __contains__(self, key): 38 | try: 39 | self[key] 40 | return True 41 | except KeyError: 42 | return False 43 | 44 | def __getitem__(self, key): 45 | # This is a little tricky, we want to lookup items in our 46 | # local namespace before we hit the delegate 47 | try: 48 | result = self._local_dict[key] 49 | except KeyError: 50 | result = self._delegate[key] 51 | return result 52 | 53 | def get(self, key, default=None): 54 | try: 55 | result = self[key] 56 | except KeyError: 57 | result = default 58 | return result 59 | 60 | def __setitem__(self, key, value): 61 | self._local_dict[key] = value 62 | 63 | def __delitem__(self, key): 64 | del self._local_dict[key] 65 | 66 | def wrap(self, ctx): 67 | ctx_child = ctx.child() 68 | this_child = self.child() 69 | this_child._delegate = ctx_child 70 | return this_child 71 | 72 | def child(self): 73 | """ In general, you should call this immediately in any 74 | constructor that receives a context """ 75 | 76 | return Context(self) 77 | 78 | def impl(self, iface): 79 | instance = self._local_dict[iface] 80 | if not isinstance(instance, iface): 81 | raise InvalidInterface("Instance [%s] doesn't implement requested interface." % instance) 82 | return instance 83 | 84 | 85 | def package_context(): 86 | """ 87 | Prepare a context with minimal dependencies for the TAAR package to be used in the Ensemble recommender Spark job 88 | """ 89 | from taar.settings import PackageCacheSettings 90 | from taar.logs.stubs import LoggingStub 91 | from taar.recommenders.cache import TAARCache 92 | 93 | ctx = Context() 94 | ctx['cache_settings'] = PackageCacheSettings 95 | ctx[IMozLogging] = LoggingStub(ctx) 96 | ctx[ITAARCache] = TAARCache(ctx) 97 | 98 | return ctx 99 | 100 | 101 | def app_context(): 102 | """ 103 | Prepare a context for the TAAR web service 104 | """ 105 | from taar.settings import AppSettings, DefaultCacheSettings, RedisCacheSettings 106 | from taar.recommenders.cache import
TAARCache 107 | from taar.recommenders.redis_cache import TAARCacheRedis 108 | from taar.logs.moz_logging import Logging 109 | 110 | ctx = Context() 111 | 112 | logger = Logging(ctx) 113 | logger.set_log_level(AppSettings.PYTHON_LOG_LEVEL) 114 | ctx[IMozLogging] = logger 115 | 116 | if AppSettings.DISABLE_REDIS: 117 | ctx['cache_settings'] = DefaultCacheSettings 118 | ctx[ITAARCache] = TAARCache.get_instance(ctx) 119 | else: 120 | ctx['cache_settings'] = RedisCacheSettings 121 | ctx[ITAARCache] = TAARCacheRedis.get_instance(ctx) 122 | 123 | from taar.recommenders import CollaborativeRecommender 124 | from taar.recommenders import SimilarityRecommender 125 | from taar.recommenders import LocaleRecommender 126 | 127 | # Note that the EnsembleRecommender is *not* in this map as it 128 | # needs to ensure that the recommender_map key is installed in the 129 | # context 130 | ctx["recommender_factory_map"] = { 131 | "collaborative": lambda: CollaborativeRecommender(ctx.child()), 132 | "similarity": lambda: SimilarityRecommender(ctx.child()), 133 | "locale": lambda: LocaleRecommender(ctx.child()), 134 | } 135 | 136 | return ctx 137 | -------------------------------------------------------------------------------- /taar/flask_app.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import sys 6 | from flask import Flask 7 | from dockerflow.flask import Dockerflow 8 | import optparse 9 | from decouple import config 10 | import importlib 11 | import sentry_sdk 12 | from sentry_sdk.integrations.flask import FlaskIntegration 13 | 14 | 15 | app = Flask(__name__) 16 | dockerflow = Dockerflow(app) 17 | 18 | # Hook the application plugin and configure it 19 | PLUGIN = config("TAAR_API_PLUGIN", default=None) 20 | 21 | 22 | sentry_sdk.init( 23 | dsn=config("SENTRY_DSN", ""), integrations=[FlaskIntegration()], 24 | ) 25 | 26 | # There should only be a single registered app for the taar-api 27 | if PLUGIN is None: 28 | sys.stderr.write("No plugin is defined.\n") 29 | sys.exit(1) 30 | 31 | 32 | # Load the function and configure the application 33 | sys.stdout.write("Loading [{}]\n".format(PLUGIN)) 34 | 35 | plugin_module = importlib.import_module(PLUGIN) 36 | configure_plugin = importlib.import_module(PLUGIN).configure_plugin 37 | APP_WRAPPER = configure_plugin(app) 38 | 39 | 40 | def flaskrun(app, default_host="127.0.0.1", default_port="8000"): 41 | """ 42 | Takes a flask.Flask instance and runs it. Parses 43 | command-line flags to configure the app. 44 | """ 45 | 46 | # Set up the command-line options 47 | parser = optparse.OptionParser() 48 | parser.add_option( 49 | "-H", 50 | "--host", 51 | help="Hostname of the Flask app " + "[default %s]" % default_host, 52 | default=default_host, 53 | ) 54 | parser.add_option( 55 | "-P", 56 | "--port", 57 | help="Port for the Flask app " + "[default %s]" % default_port, 58 | default=default_port, 59 | ) 60 | 61 | # Two options useful for debugging purposes, but 62 | # a bit dangerous so not exposed in the help message. 
63 | parser.add_option( 64 | "-d", "--debug", action="store_true", dest="debug", help=optparse.SUPPRESS_HELP 65 | ) 66 | parser.add_option( 67 | "-p", 68 | "--profile", 69 | action="store_true", 70 | dest="profile", 71 | help=optparse.SUPPRESS_HELP, 72 | ) 73 | 74 | options, _ = parser.parse_args() 75 | 76 | # If the user selects the profiling option, then we need 77 | # to do a little extra setup 78 | if options.profile: 79 | from werkzeug.contrib.profiler import ProfilerMiddleware 80 | 81 | app.config["PROFILE"] = True 82 | app.wsgi_app = ProfilerMiddleware(app.wsgi_app, restrictions=[30]) 83 | options.debug = True 84 | 85 | app.run(debug=options.debug, host=options.host, port=int(options.port)) 86 | 87 | 88 | if __name__ == "__main__": 89 | flaskrun(app) 90 | -------------------------------------------------------------------------------- /taar/interfaces.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # copy paste from https://github.com/mozilla/srgutil to get rid of this heavy legacy dependency 6 | try: 7 | from abc import ABC 8 | except Exception: 9 | from abc import ABCMeta 10 | 11 | class ABC(object): 12 | """Helper class that provides a standard way to create an ABC using 13 | inheritance. 14 | """ 15 | __metaclass__ = ABCMeta 16 | __slots__ = () 17 | 18 | 19 | class IMozLogging(ABC): 20 | def get_logger(self, name): 21 | """Get a logger with the current configuration 22 | """ 23 | 24 | def set_log_level(self, level): 25 | """Set the log level, for example 'DEBUG' 26 | """ 27 | 28 | 29 | class ITAARCache(ABC): 30 | def safe_load_data(self): 31 | raise NotImplementedError() 32 | 33 | def cache_context(self): 34 | raise NotImplementedError() 35 | -------------------------------------------------------------------------------- /taar/logs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/taar/f542a1ec1ea50812c81a9782922447adc0a5bfab/taar/logs/__init__.py -------------------------------------------------------------------------------- /taar/logs/moz_logging.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # copy paste from https://github.com/mozilla/srgutil to get rid of this heavy legacy dependency 6 | from taar.interfaces import IMozLogging 7 | import logging.config 8 | import sys 9 | 10 | 11 | class ContextFilter(logging.Filter): 12 | """Enhances log messages with contextual information""" 13 | 14 | def __init__(self, logger, func): 15 | 16 | super().__init__() 17 | self._logger = logger 18 | self._func = func 19 | 20 | def filter(self, log_record): 21 | try: 22 | if self._func: 23 | self._func(log_record) 24 | except RuntimeError: 25 | pass 26 | 27 | return True 28 | 29 | def __enter__(self): 30 | self._logger.addFilter(self) 31 | 32 | def __exit__(self, type, value, traceback): 33 | self._logger.removeFilter(self) 34 | 35 | 36 | class Logging(IMozLogging): 37 | LOG_NAME = 'srg' 38 | _log_config = { 39 | # Note that the formatters.json.logger_name must match 40 | # loggers.<LOG_NAME>
key 41 | 'version': 1, 42 | 'formatters': { 43 | 'json': { 44 | '()': 'dockerflow.logging.JsonLogFormatter', 45 | 'logger_name': LOG_NAME 46 | } 47 | }, 48 | 'handlers': { 49 | 'console': { 50 | 'level': 'INFO', 51 | 'class': 'logging.StreamHandler', 52 | 'formatter': 'json', 53 | 'stream': sys.stdout 54 | }, 55 | }, 56 | 57 | 'loggers': { 58 | LOG_NAME: { 59 | 'handlers': ['console'], 60 | 'level': 'INFO' 61 | }, 62 | } 63 | } 64 | 65 | def __init__(self, ctx): 66 | self._ctx = ctx 67 | self._logger_prefix = '' 68 | self._apply_config() 69 | 70 | def set_log_level(self, log_level): 71 | self._log_config['loggers'][self.LOG_NAME]['level'] = log_level 72 | self._log_config['handlers']['console']['level'] = log_level 73 | self._apply_config() 74 | 75 | def _apply_config(self): 76 | self._logger_prefix = self._log_config['formatters']['json']['logger_name'] 77 | logging.config.dictConfig(self._log_config) 78 | 79 | def get_logger(self, name): 80 | return logging.getLogger("%s.%s" % (self._logger_prefix, name)) 81 | -------------------------------------------------------------------------------- /taar/logs/stubs.py: -------------------------------------------------------------------------------- 1 | from sys import exc_info 2 | 3 | from taar.interfaces import IMozLogging 4 | 5 | 6 | class EmergencyLogger: 7 | """ 8 | We need this one to get rid of the python logging dependency in the Ensemble spark job 9 | (see the more detailed explanation in the readme). 10 | It uses only print and logs only errors and warnings 11 | """ 12 | def debug(self, msg, *args, **kwargs): 13 | pass 14 | 15 | def info(self, msg, *args, **kwargs): 16 | pass 17 | 18 | def warn(self, msg, *args, **kwargs): 19 | print(f'WARN: {msg}') 20 | 21 | def warning(self, msg, *args, **kwargs): 22 | self.warn(msg) 23 | 24 | def error(self, msg, e=None, *args, **kwargs): 25 | print(f'ERROR: {msg}, {e or exc_info()}') 26 | 27 | def exception(self, msg, *args, **kwargs): 28 | self.error(msg, *args, **kwargs) 29 | 30 | 31 | class LoggingStub(IMozLogging): 32 | def __init__(self, ctx): 33 | pass 34 | 35 | def get_logger(self, name): 36 | return EmergencyLogger() 37 | 38 | def set_log_level(self, level): 39 | pass 40 | -------------------------------------------------------------------------------- /taar/plugin.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
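# Illustrative usage sketch for the stub logger defined above in
# taar/logs/stubs.py (an editorial example, not part of plugin.py):
#
#     from taar.logs.stubs import LoggingStub
#
#     logger = LoggingStub(ctx=None).get_logger("taar")
#     logger.info("ignored")                # silently dropped
#     logger.warning("model file missing")  # prints: WARN: model file missing
#     logger.error("fetch failed", ValueError("boom"))
#                                           # prints: ERROR: fetch failed, boom
#
# Because EmergencyLogger holds no locks, objects that capture it stay
# picklable, which is what the ensemble Spark job requires.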
4 | 5 | from flask import request 6 | import json 7 | 8 | import markus 9 | from sentry_sdk import capture_exception 10 | 11 | # TAAR specific libraries 12 | from taar.context import app_context 13 | from taar.logs.moz_logging import ContextFilter 14 | from taar.profile_fetcher import ProfileFetcher 15 | from taar.recommenders.guid_based_recommender import GuidBasedRecommender 16 | from taar.recommenders.recommendation_manager import RecommenderFactory, RecommendationManager 17 | from taar.settings import AppSettings 18 | 19 | 20 | def acquire_taarlite_singleton(PROXY_MANAGER): 21 | if PROXY_MANAGER.getTaarLite() is None: 22 | ctx = app_context() 23 | root_ctx = ctx.child() 24 | instance = GuidBasedRecommender(root_ctx) 25 | PROXY_MANAGER.setTaarLite(instance) 26 | return PROXY_MANAGER.getTaarLite() 27 | 28 | 29 | def acquire_taar_singleton(PROXY_MANAGER): 30 | if PROXY_MANAGER.getTaarRM() is None: 31 | ctx = app_context() 32 | 33 | profile_fetcher = ProfileFetcher(ctx) 34 | ctx["profile_fetcher"] = profile_fetcher 35 | 36 | # Lock the context down after we've got basic bits installed 37 | root_ctx = ctx.child() 38 | r_factory = RecommenderFactory(root_ctx) 39 | root_ctx["recommender_factory"] = r_factory 40 | instance = RecommendationManager(root_ctx.child()) 41 | PROXY_MANAGER.setTaarRM(instance) 42 | return PROXY_MANAGER.getTaarRM() 43 | 44 | 45 | class ResourceProxy(object): 46 | def __init__(self): 47 | self._resource = None 48 | self._taarlite_resource = None 49 | 50 | def setTaarRM(self, rsrc): 51 | self._resource = rsrc 52 | 53 | def getTaarRM(self): 54 | return self._resource 55 | 56 | def setTaarLite(self, rsrc): 57 | self._taarlite_resource = rsrc 58 | 59 | def getTaarLite(self): 60 | return self._taarlite_resource 61 | 62 | 63 | PROXY_MANAGER = ResourceProxy() 64 | 65 | 66 | def clean_promoted_guids(raw_promoted_guids): 67 | """ Verify that the promoted GUIDs are formatted correctly; 68 | if they are not, return an empty list instead. 69 | """ 70 | valid = True 71 | 72 | for row in raw_promoted_guids: 73 | if len(row) != 2: 74 | valid = False 75 | break 76 | 77 | if not ( 78 | (isinstance(row[0], str)) 79 | and (isinstance(row[1], int) or isinstance(row[1], float)) # noqa 80 | ): 81 | valid = False 82 | break 83 | 84 | if valid: 85 | return raw_promoted_guids 86 | return [] 87 | 88 | 89 | def merge_promoted_guids(promoted_guids, recommended_guids): 90 | guids = set() 91 | final = [] 92 | tmp = sorted( 93 | promoted_guids + [x for x in recommended_guids], 94 | key=lambda x: x[1], 95 | reverse=True, 96 | ) 97 | for guid, weight in tmp: 98 | if guid not in guids: 99 | final.append((guid, weight)) 100 | guids.add(guid) 101 | return final 102 | 103 | 104 | def configure_plugin(app): # noqa: C901 105 | """ 106 | This is a factory function that configures all the routes for 107 | flask given a particular library. 108 | """ 109 | 110 | markus.configure( 111 | backends=[ 112 | { 113 | # Log metrics to local instance of statsd 114 | # server.
Use DatadogMetrics client 115 | "class": "markus.backends.datadog.DatadogMetrics", 116 | "options": { 117 | "statsd_host": AppSettings.STATSD_HOST, 118 | "statsd_port": AppSettings.STATSD_PORT, 119 | "statsd_namespace": "", 120 | }, 121 | } 122 | ] 123 | ) 124 | 125 | @app.route("/taarlite/api/v1/addon_recommendations/<string:guid>/") 126 | def taarlite_recommendations(guid): 127 | """Return a list of TAAR-lite recommendations for the given addon GUID.""" 128 | # Use the module global PROXY_MANAGER 129 | global PROXY_MANAGER 130 | taarlite_recommender = acquire_taarlite_singleton(PROXY_MANAGER) 131 | 132 | cdict = {"guid": guid} 133 | normalization_type = request.args.get("normalize", None) 134 | if normalization_type is not None: 135 | cdict["normalize"] = normalization_type 136 | 137 | def set_extra(record): 138 | record.url = request.path 139 | record.guid = guid 140 | 141 | with ContextFilter(taarlite_recommender.logger, set_extra): 142 | recommendations = taarlite_recommender.recommend( 143 | client_data=cdict, limit=AppSettings.TAARLITE_MAX_RESULTS 144 | ) 145 | 146 | if len(recommendations) != AppSettings.TAARLITE_MAX_RESULTS: 147 | recommendations = [] 148 | 149 | # Strip out weights from TAAR results to maintain compatibility 150 | # with TAAR 1.0 151 | jdata = {"results": [x[0] for x in recommendations]} 152 | 153 | response = app.response_class( 154 | response=json.dumps(jdata), status=200, mimetype="application/json" 155 | ) 156 | return response 157 | 158 | @app.route( 159 | "/v1/api/client_has_addon/<string:hashed_client_id>/<string:addon_id>/", methods=["GET"], 160 | ) 161 | def client_has_addon(hashed_client_id, addon_id): 162 | # Use the module global PROXY_MANAGER 163 | global PROXY_MANAGER 164 | recommendation_manager = acquire_taar_singleton(PROXY_MANAGER) 165 | pf = recommendation_manager._ctx["profile_fetcher"] 166 | 167 | client_meta = pf.get(hashed_client_id) 168 | if client_meta is None: 169 | # no valid client metadata was found for the given 170 | # clientId 171 | result = {"results": False, "error": "No client found"} 172 | response = app.response_class( 173 | response=json.dumps(result), status=200, mimetype="application/json", 174 | ) 175 | return response 176 | 177 | result = {"results": addon_id in client_meta.get("installed_addons", [])} 178 | response = app.response_class( 179 | response=json.dumps(result), status=200, mimetype="application/json" 180 | ) 181 | return response 182 | 183 | @app.route("/v1/api/recommendations/<uuid:hashed_client_id>/", methods=["GET", "POST"]) 184 | def recommendations(hashed_client_id): 185 | """Return a list of recommendations provided a telemetry client_id.""" 186 | # Use the module global PROXY_MANAGER 187 | global PROXY_MANAGER 188 | 189 | extra_data = {} 190 | extra_data["options"] = {} 191 | extra_data["options"]["promoted"] = [] 192 | 193 | try: 194 | if request.method == "POST": 195 | json_data = request.data 196 | # Under Python 3, request.data is returned as a bytes 197 | # type instead of a string type,
198 | # so decode it to a string before parsing the JSON 199 | if type(json_data) == bytes: 200 | json_data = json_data.decode("utf8") 201 | 202 | if json_data != "": 203 | post_data = json.loads(json_data) 204 | raw_promoted_guids = post_data.get("options", {}).get( 205 | "promoted", [] 206 | ) 207 | promoted_guids = clean_promoted_guids(raw_promoted_guids) 208 | extra_data["options"]["promoted"] = promoted_guids 209 | 210 | except Exception as e: 211 | jdata = {} 212 | jdata["results"] = [] 213 | jdata["error"] = "Invalid JSON in POST: {}".format(e) 214 | capture_exception(e) 215 | return app.response_class( 216 | response=json.dumps(jdata), status=400, mimetype="application/json" 217 | ) 218 | 219 | # Coerce the uuid.UUID type into a string 220 | client_id = str(hashed_client_id) 221 | 222 | locale = request.args.get("locale", None) 223 | if locale is not None: 224 | extra_data["locale"] = locale 225 | 226 | platform = request.args.get("platform", None) 227 | if platform is not None: 228 | extra_data["platform"] = platform 229 | 230 | recommendation_manager = acquire_taar_singleton(PROXY_MANAGER) 231 | 232 | def set_extra(record): 233 | record.url = request.path 234 | if locale: 235 | record.locale = locale 236 | if platform: 237 | record.platform = platform 238 | record.client_id = client_id 239 | record.method = request.method 240 | 241 | with ContextFilter(recommendation_manager.logger, set_extra): 242 | recommendations = recommendation_manager.recommend( 243 | client_id=client_id, limit=AppSettings.TAAR_MAX_RESULTS, extra_data=extra_data 244 | ) 245 | 246 | promoted_guids = extra_data.get("options", {}).get("promoted", []) 247 | recommendations = merge_promoted_guids(promoted_guids, recommendations) 248 | 249 | # Strip out weights from TAAR results to maintain compatibility 250 | # with TAAR 1.0 251 | jdata = {"results": [x[0] for x in recommendations]} 252 | 253 | response = app.response_class( 254 | response=json.dumps(jdata), status=200, mimetype="application/json" 255 | ) 256 | return response 257 | 258 | class MyPlugin: 259 | def set(self, config_options): 260 | """ 261 | This setter is primarily so that we can instrument the 262 | cached RecommendationManager implementation under test. 263 | 264 | All plugins should implement this set method to enable 265 | overwriting configuration options with a TAAR library. 266 | """ 267 | global PROXY_MANAGER 268 | if "PROXY_RESOURCE" in config_options: 269 | PROXY_MANAGER._resource = config_options["PROXY_RESOURCE"] 270 | 271 | return MyPlugin() 272 | -------------------------------------------------------------------------------- /taar/profile_fetcher.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 | 5 | from taar.interfaces import IMozLogging 6 | from google.cloud import bigtable 7 | from google.cloud.bigtable import column_family 8 | from google.cloud.bigtable import row_filters 9 | import json 10 | import zlib 11 | import datetime 12 | import markus 13 | 14 | from taar.settings import AppSettings 15 | 16 | metrics = markus.get_metrics("taar") 17 | 18 | 19 | class BigTableProfileController: 20 | """ 21 | This class implements the profile database in BigTable 22 | """ 23 | 24 | def __init__(self, ctx, project_id, instance_id, table_id): 25 | self._ctx = ctx 26 | self._project_id = project_id 27 | self._instance_id = instance_id 28 | self._table_id = table_id 29 | self._column_family_id = "profile" 30 | self._column_name = "payload".encode() 31 | 32 | # Define the GC policy to retain only the most recent version 33 | max_age_rule = column_family.MaxAgeGCRule(datetime.timedelta(days=90)) 34 | max_versions_rule = column_family.MaxVersionsGCRule(1) 35 | self._gc_rule = column_family.GCRuleUnion( 36 | rules=[max_age_rule, max_versions_rule] 37 | ) 38 | 39 | self._client = bigtable.Client(project=project_id, admin=False) 40 | self._instance = self._client.instance(self._instance_id) 41 | 42 | def create_table(self): 43 | # admin needs to be set to True here so that we can create the 44 | # table 45 | admin_client = bigtable.Client(project=self._project_id, admin=True) 46 | instance = admin_client.instance(self._instance_id) 47 | print("Creating the {} table.".format(self._table_id)) 48 | table = instance.table(self._table_id) 49 | 50 | column_families = {self._column_family_id: self._gc_rule} 51 | table.create(column_families=column_families) 52 | 53 | def set_client_profile(self, client_profile): 54 | 55 | # Keys must be UTF8 encoded 56 | row_key = client_profile["client_id"].encode("utf8") 57 | 58 | table = self._instance.table(self._table_id) 59 | 60 | row = table.direct_row(row_key) 61 | row.set_cell( 62 | self._column_family_id, 63 | self._column_name, 64 | zlib.compress(json.dumps(client_profile).encode("utf8")), 65 | timestamp=datetime.datetime.utcnow(), 66 | ) 67 | table.mutate_rows([row]) 68 | 69 | def get_client_profile(self, client_id): 70 | """This fetches a single client record out of GCP BigTable 71 | """ 72 | try: 73 | row_key = client_id.encode() 74 | table = self._instance.table(self._table_id) 75 | row_filter = row_filters.CellsColumnLimitFilter(1) 76 | row = table.read_row(row_key, row_filter) 77 | cell = row.cells[self._column_family_id][self._column_name][0] 78 | jdata = json.loads(zlib.decompress(cell.value).decode("utf-8")) 79 | return jdata 80 | except Exception: 81 | logger = self._ctx[IMozLogging].get_logger("taar") 82 | logger.warning(f"Error loading client profile for {client_id}") 83 | return None 84 | 85 | 86 | class ProfileFetcher: 87 | """ Fetch the latest information for a client on the backing 88 | datastore 89 | """ 90 | 91 | def __init__(self, ctx): 92 | self._ctx = ctx 93 | self.logger = self._ctx[IMozLogging].get_logger("taar") 94 | self.__client = None 95 | 96 | @property 97 | def _client(self): 98 | if self.__client is None: 99 | self.__client = BigTableProfileController( 100 | self._ctx, 101 | project_id=AppSettings.BIGTABLE_PROJECT_ID, 102 | instance_id=AppSettings.BIGTABLE_INSTANCE_ID, 103 | table_id=AppSettings.BIGTABLE_TABLE_ID, 104 | ) 105 | return self.__client 106 | 107 | def set_client(self, client): 108 | self.__client = client 109 | 110 | @metrics.timer_decorator("bigtable_read") 111 | def get(self, client_id): 112 | try: 113 | 
profile_data = self._client.get_client_profile(client_id) 114 | 115 | if profile_data is None: 116 | self.logger.debug( 117 | "Client profile not found", extra={"client_id": client_id} 118 | ) 119 | return None 120 | 121 | addon_ids = [ 122 | addon["addon_id"] 123 | for addon in profile_data.get("active_addons", []) 124 | if not addon.get("is_system", False) 125 | ] 126 | 127 | return { 128 | "client_id": client_id, 129 | "geo_city": profile_data.get("city", ""), 130 | "subsession_length": profile_data.get("subsession_length", 0), 131 | "locale": profile_data.get("locale", ""), 132 | "os": profile_data.get("os", ""), 133 | "installed_addons": addon_ids, 134 | "disabled_addons_ids": profile_data.get("disabled_addons_ids", []), 135 | "bookmark_count": profile_data.get("places_bookmarks_count", 0), 136 | "tab_open_count": profile_data.get( 137 | "scalar_parent_browser_engagement_tab_open_event_count", 0 138 | ), 139 | "total_uri": profile_data.get( 140 | "scalar_parent_browser_engagement_total_uri_count", 0 141 | ), 142 | "unique_tlds": profile_data.get( 143 | "scalar_parent_browser_engagement_unique_domains_count", 0 144 | ), 145 | } 146 | except Exception as e: 147 | # We just want to catch any kind of error here to make 148 | # sure nothing breaks 149 | self.logger.error("Error loading client data", e) 150 | return None 151 | -------------------------------------------------------------------------------- /taar/recommenders/__init__.py: -------------------------------------------------------------------------------- 1 | from .collaborative_recommender import CollaborativeRecommender 2 | from .locale_recommender import LocaleRecommender 3 | from .similarity_recommender import SimilarityRecommender 4 | 5 | 6 | __all__ = [ 7 | "CollaborativeRecommender", 8 | "LocaleRecommender", 9 | "SimilarityRecommender", 10 | ] 11 | -------------------------------------------------------------------------------- /taar/recommenders/base_recommender.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | 4 | class AbstractRecommender: 5 | """Base class for recommenders. 6 | 7 | Subclasses must implement can_recommend and recommend. 8 | """ 9 | 10 | __metaclass__ = ABCMeta 11 | 12 | @abstractmethod 13 | def can_recommend(self, client_data, extra_data={}): 14 | """Tell whether this recommender can recommend the given client.""" 15 | pass 16 | 17 | @abstractmethod 18 | def recommend(self, client_data, limit, extra_data={}): 19 | """Return a list of recommendations for the given client.""" 20 | pass 21 | 22 | def __str__(self): 23 | return self.__class__.__name__ 24 | -------------------------------------------------------------------------------- /taar/recommenders/collaborative_recommender.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
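# Editorial sketch (not part of the original file): the scoring below is a
# small linear-algebra pipeline over a hashed item space. With a binary
# query vector q marking the client's installed addons and a latent item
# matrix M (one row of "features" per addon), the affinity for a candidate
# item works out as:
#
#     import numpy as np
#     q = np.array([1.0, 0.0, 1.0])       # toy example: items 0 and 2 installed
#     M = np.random.rand(3, 8)            # toy item -> latent-factor matrix
#     user_factors = np.matmul(q, M)      # fold installed addons into one vector
#     score = np.dot(user_factors, M[1])  # affinity for candidate item 1
#
# In the real code below, q is built over cache["raw_item_matrix"], the
# latent matrix comes from cache["collab_model"], and candidate features
# come from each item's "features" entry.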
4 | 5 | from taar.interfaces import IMozLogging, ITAARCache 6 | import numpy as np 7 | import operator as op 8 | 9 | from taar.recommenders.base_recommender import AbstractRecommender 10 | 11 | 12 | def java_string_hashcode(s): 13 | h = 0 14 | for c in s: 15 | h = (31 * h + ord(c)) & 0xFFFFFFFF 16 | return ((h + 0x80000000) & 0xFFFFFFFF) - 0x80000000 17 | 18 | 19 | def positive_hash(s): 20 | return java_string_hashcode(s) & 0x7FFFFF 21 | 22 | 23 | class CollaborativeRecommender(AbstractRecommender): 24 | """ The addon recommendation interface to the collaborative filtering model. 25 | 26 | Usage example:: 27 | 28 | recommender = CollaborativeRecommender() 29 | dists = recommender.recommend(client_info) 30 | """ 31 | 32 | def __init__(self, ctx): 33 | self._ctx = ctx 34 | 35 | self.logger = self._ctx[IMozLogging].get_logger("taar") 36 | 37 | self._cache = self._ctx[ITAARCache] 38 | 39 | def _get_cache(self, extra_data): 40 | tmp = extra_data.get("cache", None) 41 | if tmp is None: 42 | tmp = self._cache.cache_context() 43 | return tmp 44 | 45 | def can_recommend(self, client_data, extra_data={}): 46 | cache = self._get_cache(extra_data) 47 | # We can't recommend if we don't have our data files. 48 | if ( 49 | cache["raw_item_matrix"] is None 50 | or cache["collab_model"] is None 51 | or cache["addon_mapping"] is None 52 | ): 53 | return False 54 | 55 | # We only get meaningful recommendations if a client has at least 56 | # one addon installed. 57 | if len(client_data.get("installed_addons", [])) > 0: 58 | return True 59 | 60 | return False 61 | 62 | def _recommend(self, client_data, limit, extra_data): 63 | cache = self._get_cache(extra_data) 64 | 65 | installed_addons_as_hashes = [ 66 | positive_hash(addon_id) 67 | for addon_id in client_data.get("installed_addons", []) 68 | ] 69 | 70 | # Build the query vector by setting the position of the queried addons to 1.0 71 | # and the others to 0.0. 72 | query_vector = np.array( 73 | [ 74 | 1.0 if (entry.get("id") in installed_addons_as_hashes) else 0.0 75 | for entry in cache["raw_item_matrix"] 76 | ] 77 | ) 78 | 79 | # Build the user factors matrix. 80 | user_factors = np.matmul(query_vector, cache["collab_model"]) 81 | user_factors_transposed = np.transpose(user_factors) 82 | 83 | # Compute the distance between the user and all the addons in the latent 84 | # space. 85 | distances = {} 86 | for addon in cache["raw_item_matrix"]: 87 | # We don't really need to show the items we requested. 88 | # They will always end up with the greatest score. Also 89 | # filter out legacy addons from the suggestions. 90 | hashed_id = addon.get("id") 91 | str_hashed_id = str(hashed_id) 92 | if ( 93 | hashed_id in installed_addons_as_hashes 94 | or str_hashed_id not in cache["addon_mapping"] 95 | or cache["addon_mapping"][str_hashed_id].get("isWebextension", False) 96 | is False 97 | ): 98 | continue 99 | 100 | dist = np.dot(user_factors_transposed, addon.get("features")) 101 | # Read the addon ids from the "addon_mapping" looking it 102 | # up by 'id' (which is a hashed value). 103 | addon_id = cache["addon_mapping"][str_hashed_id].get("id") 104 | distances[addon_id] = dist 105 | 106 | # Sort the suggested addons by their score and return the 107 | # sorted list of addon ids.
108 | sorted_dists = sorted(distances.items(), key=op.itemgetter(1), reverse=True) 109 | recommendations = [(s[0], s[1]) for s in sorted_dists[:limit]] 110 | return recommendations 111 | 112 | def recommend(self, client_data, limit, extra_data={}): 113 | # Addon identifiers are stored as positive hash values within the model. 114 | 115 | recommendations = self._recommend(client_data, limit, extra_data) 116 | 117 | log_data = ( 118 | client_data["client_id"], 119 | str([r[0] for r in recommendations]), 120 | ) 121 | self.logger.debug( 122 | "collaborative_recommender_triggered, " 123 | "client_id: [%s], " 124 | "guids: [%s]" % log_data 125 | ) 126 | 127 | return recommendations 128 | -------------------------------------------------------------------------------- /taar/recommenders/debug.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import contextlib 6 | import time 7 | 8 | 9 | @contextlib.contextmanager 10 | def log_timer_debug(msg, logger): 11 | start_time = time.time() 12 | try: 13 | yield 14 | finally: 15 | end_time = time.time() 16 | logger.debug(msg + f" Completed in {end_time-start_time} seconds") 17 | 18 | 19 | @contextlib.contextmanager 20 | def log_timer_info(msg, logger): 21 | start_time = time.time() 22 | try: 23 | yield 24 | finally: 25 | end_time = time.time() 26 | logger.info(msg + f" Completed in {end_time-start_time} seconds") 27 | -------------------------------------------------------------------------------- /taar/recommenders/ensemble_recommender.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import itertools 6 | 7 | import markus 8 | from sentry_sdk import capture_exception 9 | 10 | from taar.interfaces import IMozLogging, ITAARCache 11 | from taar.recommenders.debug import log_timer_debug 12 | from taar.utils import hasher 13 | from taar.recommenders.base_recommender import AbstractRecommender 14 | 15 | metrics = markus.get_metrics("taar") 16 | 17 | 18 | def is_test_client(client_id): 19 | """ any client_id where the GUID is composed of a single digit 20 | (repeating) is a test id """ 21 | return len(set(client_id.replace("-", ""))) == 1 22 | 23 | 24 | class EnsembleRecommender(AbstractRecommender): 25 | """ 26 | The EnsembleRecommender is a collection of recommenders where the 27 | results from each recommender are amplified or dampened by a 28 | factor. The aggregate results are combined and used to recommend 29 | addons for users.
30 | """ 31 | 32 | def __init__(self, ctx): 33 | self.RECOMMENDER_KEYS = ["collaborative", "similarity", "locale"] 34 | self._ctx = ctx 35 | 36 | self._redis_cache = self._ctx[ITAARCache] 37 | self.logger = self._ctx[IMozLogging].get_logger("taar.ensemble") 38 | 39 | assert "recommender_factory" in self._ctx 40 | 41 | self._init_from_ctx() 42 | 43 | def _get_cache(self, extra_data): 44 | tmp = extra_data.get("cache", None) 45 | if tmp is None: 46 | tmp = self._redis_cache.cache_context() 47 | return tmp 48 | 49 | def getWeights(self): 50 | return self._redis_cache.ensemble_weights() 51 | 52 | def _init_from_ctx(self): 53 | # Copy the map of the recommenders 54 | self._recommender_map = {} 55 | 56 | recommender_factory = self._ctx["recommender_factory"] 57 | for rkey in self.RECOMMENDER_KEYS: 58 | self._recommender_map[rkey] = recommender_factory.create(rkey) 59 | 60 | self.logger.info("EnsembleRecommender initialized") 61 | 62 | def can_recommend(self, client_data, extra_data={}): 63 | """The ensemble recommender is always going to be 64 | available if at least one recommender is available""" 65 | result = sum( 66 | [ 67 | self._recommender_map[rkey].can_recommend(client_data) 68 | for rkey in self.RECOMMENDER_KEYS 69 | ] 70 | ) 71 | self.logger.debug("Ensemble can_recommend: {}".format(result)) 72 | return result 73 | 74 | @metrics.timer_decorator("ensemble_recommend") 75 | def recommend(self, client_data, limit, extra_data={}): 76 | cache = self._get_cache(extra_data) 77 | client_id = client_data.get("client_id", "no-client-id") 78 | 79 | if is_test_client(client_id): 80 | whitelist = cache["whitelist"] 81 | samples = whitelist[:limit] 82 | self.logger.info("Test ID detected [{}]".format(client_id)) 83 | 84 | # Compute a stable weight for any whitelisted addon based 85 | # on the sha256 hash of the GUID 86 | p = [(int(hasher(s), 16) % 100) / 100.0 for s in samples] 87 | results = list(zip(samples, p)) 88 | else: 89 | try: 90 | metrics.incr("error_ensemble", value=1) 91 | results = self._recommend(client_data, limit, extra_data) 92 | except Exception as e: 93 | results = [] 94 | self.logger.exception(f"Ensemble recommender crashed for {client_id}") 95 | capture_exception(e) 96 | return results 97 | 98 | def _recommend(self, client_data, limit, extra_data={}): 99 | """ 100 | Ensemble recommendations are aggregated from individual 101 | recommenders. The ensemble recommender applies a weight to 102 | the recommendation outputs of each recommender to reorder the 103 | recommendations to be a better fit. 104 | 105 | The intuitive understanding is that the total space of 106 | recommended addons across all recommenders will include the 107 | 'true' addons that should be recommended better than any 108 | individual recommender. The ensemble method simply needs to 109 | weight each recommender appropriate so that the ordering is 110 | correct. 111 | """ 112 | cache = self._get_cache(extra_data) 113 | self.logger.debug("Ensemble recommend invoked") 114 | preinstalled_addon_ids = client_data.get("installed_addons", []) 115 | 116 | # Compute an extended limit by adding the length of 117 | # the list of any preinstalled addons. 
118 | extended_limit = limit + len(preinstalled_addon_ids) 119 | 120 | flattened_results = [] 121 | ensemble_weights = cache["ensemble_weights"] 122 | 123 | for rkey in self.RECOMMENDER_KEYS: 124 | self._recommend_single(client_data, ensemble_weights, extended_limit, extra_data, flattened_results, rkey) 125 | 126 | # Sort the results by the GUID 127 | flattened_results.sort(key=lambda item: item[0]) 128 | 129 | # group by the guid, sum up the weights for recurring GUID 130 | # suggestions across all recommenders 131 | guid_grouper = itertools.groupby(flattened_results, lambda item: item[0]) 132 | 133 | ensemble_suggestions = [] 134 | for (guid, guid_group) in guid_grouper: 135 | weight_sum = sum([v for (g, v) in guid_group]) 136 | item = (guid, weight_sum) 137 | ensemble_suggestions.append(item) 138 | 139 | # Sort in reverse order (greatest weight to least) 140 | ensemble_suggestions.sort(key=lambda x: -x[1]) 141 | 142 | filtered_ensemble_suggestions = [ 143 | (guid, weight) 144 | for (guid, weight) in ensemble_suggestions 145 | if guid not in preinstalled_addon_ids 146 | ] 147 | 148 | results = filtered_ensemble_suggestions[:limit] 149 | 150 | log_data = ( 151 | client_data["client_id"], 152 | extra_data.get("guid_randomization", False), 153 | str(ensemble_weights), 154 | str([r[0] for r in results]), 155 | ) 156 | self.logger.debug( 157 | "client_id: [%s], guid_randomization: [%s], ensemble_weight: [%s], guids: [%s]" 158 | % log_data 159 | ) 160 | return results 161 | 162 | def _recommend_single(self, client_data, ensemble_weights, extended_limit, extra_data, flattened_results, rkey): 163 | with log_timer_debug(f"{rkey} recommend invoked", self.logger): 164 | recommender = self._recommender_map[rkey] 165 | if not recommender.can_recommend(client_data, extra_data): 166 | return 167 | with metrics.timer(f"{rkey}_recommend"): 168 | try: 169 | raw_results = recommender.recommend( 170 | client_data, extended_limit, extra_data 171 | ) 172 | except Exception as e: 173 | metrics.incr(f"error_{rkey}", value=1) 174 | self.logger.exception( 175 | "{} recommender crashed for {}".format(rkey, 176 | client_data.get("client_id", "no-client-id") 177 | ), 178 | e, 179 | ) 180 | # raw_results is undefined if the recommender crashed; skip it 181 | return 182 | reweighted_results = [] 183 | for guid, weight in raw_results: 184 | item = (guid, weight * ensemble_weights[rkey]) 185 | reweighted_results.append(item) 186 | flattened_results.extend(reweighted_results) 187 | -------------------------------------------------------------------------------- /taar/recommenders/guid_based_recommender.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | 6 | from taar.interfaces import IMozLogging, ITAARCache 7 | 8 | import markus 9 | 10 | from taar.recommenders.debug import log_timer_debug 11 | 12 | metrics = markus.get_metrics("taar") 13 | 14 | 15 | NORM_MODE_ROWNORMSUM = "rownorm_sum" 16 | NORM_MODE_ROWCOUNT = "row_count" 17 | NORM_MODE_ROWSUM = "row_sum" 18 | NORM_MODE_GUIDCEPTION = "guidception" 19 | 20 | 21 | class GuidBasedRecommender: 22 | """ A recommender class that returns top N addons based on a 23 | passed addon identifier.
This will load a json file containing 24 | updated top n addons coinstalled with the addon passed as an input 25 | parameter, based on a periodically updated addon-addon 26 | coinstallation frequency table generated from Longitudinal 27 | Telemetry data. This recommender will drive recommendations 28 | surfaced on addons.mozilla.org 29 | 30 | 31 | We store the JSON data for the GUID coinstallation in memory. This 32 | consumes ~ 15.8MB of heap. 33 | 34 | In [10]: from pympler import asizeof 35 | 36 | In [11]: jdata = json.load(open('guid_coinstallation.json')) 37 | 38 | In [12]: asizeof.asizeof(jdata) 39 | Out[12]: 15784672 40 | 41 | Each of the data normalization dictionaries is also stored in 42 | memory. 43 | """ 44 | 45 | _addons_coinstallations = None 46 | _guid_maps = {} 47 | 48 | # Define recursion levels for guid-ception 49 | RECURSION_LEVELS = 3 50 | 51 | def __init__(self, ctx): 52 | self._ctx = ctx 53 | self.logger = self._ctx[IMozLogging].get_logger("taarlite") 54 | 55 | self._cache = ctx[ITAARCache] 56 | self.logger.info("GUIDBasedRecommender is initialized") 57 | 58 | def cache_ready(self): 59 | return self._cache.is_active() 60 | 61 | def can_recommend(self, client_data): 62 | # We can't recommend if we don't have our data files. 63 | if not self._cache.is_active(): 64 | return False 65 | 66 | # If we have data coming from other sources, we can use that for 67 | # recommending. 68 | addon_guid = client_data.get("guid", None) 69 | if not isinstance(addon_guid, str): 70 | return False 71 | 72 | # Use a dictionary keyed on the query guid 73 | if not self._cache.has_coinstalls_for(addon_guid): 74 | return False 75 | 76 | if not self._cache.get_coinstalls(addon_guid): 77 | return False 78 | 79 | return True 80 | 81 | @metrics.timer_decorator("guid_recommendation") 82 | def recommend(self, client_data, limit=4): 83 | """ 84 | TAAR lite will yield 4 recommendations for the AMO page 85 | """ 86 | 87 | if not self._cache.is_active(): 88 | return [] 89 | 90 | with log_timer_debug("Results computed", self.logger): 91 | 92 | with log_timer_debug("get client data", self.logger): 93 | addon_guid = client_data.get("guid") 94 | 95 | # Get the raw co-installation result dictionary 96 | with log_timer_debug("Get filtered coinstallations", self.logger): 97 | result_dict = self._cache.get_filtered_coinstall(addon_guid, {}) 98 | 99 | with log_timer_debug("acquire normalization method", self.logger): 100 | normalize = client_data.get("normalize", NORM_MODE_ROWNORMSUM) 101 | 102 | norm_dict = { 103 | "none": lambda guid, x: x, 104 | NORM_MODE_ROWCOUNT: self.norm_row_count, 105 | NORM_MODE_ROWSUM: self.norm_row_sum, 106 | NORM_MODE_ROWNORMSUM: self.norm_rownorm_sum, 107 | NORM_MODE_GUIDCEPTION: self.norm_guidception, 108 | } 109 | 110 | if normalize is not None and normalize not in norm_dict.keys(): 111 | # Yield no results if the normalization method is not 112 | # specified 113 | self.logger.warning( 114 | "Invalid normalization parameter detected: [%s]" % normalize 115 | ) 116 | return [] 117 | 118 | # Bind the normalization method 119 | norm_method = norm_dict[normalize] 120 | 121 | with log_timer_debug( 122 | f"Compute normalization using method:{normalize}", self.logger 123 | ): 124 | # Apply normalization 125 | tmp_result_dict = norm_method(addon_guid, result_dict) 126 | 127 | # Augment the result_dict with the installation counts 128 | # and then we can sort using lexical sorting of strings.
129 | # The idea here is to get something in the form of 130 | # 0000.0000.0000 131 | # The computed weight takes the first and second segments of 132 | # integers. The third segment is the installation count of 133 | # the addon but is zero padded. 134 | 135 | TWICE_LIMIT = limit * 2 136 | with log_timer_debug( 137 | f"Augment {TWICE_LIMIT} with installation counts and resorted", 138 | self.logger, 139 | ): 140 | result_list = [] 141 | rank_sorted = sorted( 142 | tmp_result_dict.items(), key=lambda x: x[1], reverse=True 143 | ) 144 | for k, v in rank_sorted[:TWICE_LIMIT]: 145 | lex_value = "{0:020.10f}.{1:010d}".format( 146 | v, self._cache.get_rankings(k, 0) 147 | ) 148 | result_list.append((k, lex_value)) 149 | 150 | # Sort the result list in descending order by weight 151 | result_list.sort(key=lambda x: x[1], reverse=True) 152 | 153 | self.logger.info( 154 | "Addon related recommendations results", 155 | extra={'guid': str(addon_guid), 'recs': [str(r[0]) for r in result_list[:limit]]} 156 | ) 157 | 158 | return result_list[:limit] 159 | 160 | def norm_row_count(self, key_guid, input_coinstall_dict): 161 | """This normalization method counts the unique times that a 162 | GUID is coinstalled with any other GUID. 163 | 164 | This dampens the weight of any suggested GUID inversely 165 | proportional to its overall popularity. 166 | """ 167 | 168 | output_result_dict = {} 169 | for result_guid, result_count in input_coinstall_dict.items(): 170 | output_result_dict[result_guid] = ( 171 | 1.0 * result_count / self._cache.guid_maps_rowcount(result_guid) 172 | ) 173 | return output_result_dict 174 | 175 | def norm_row_sum(self, key_guid, input_coinstall_dict): 176 | """This normalization method normalizes the weights for the suggested 177 | coinstallation GUIDs based on the sum of the weights for the 178 | coinstallation GUIDs given a key GUID. 179 | """ 180 | 181 | def generate_row_sum_list(): 182 | for guid, guid_weight in input_coinstall_dict.items(): 183 | norm_guid_weight = ( 184 | guid_weight * 1.0 / self._cache.guid_maps_count_map(guid) 185 | ) 186 | 187 | yield guid, norm_guid_weight 188 | 189 | return dict(generate_row_sum_list()) 190 | 191 | def norm_rownorm_sum(self, key_guid, input_coinstall_dict): 192 | """This normalization is the same as norm_row_sum, but we also 193 | divide the result by the sum of 194 | (addon coinstall instances)/(addon coinstall total instances) 195 | 196 | The testcase for this scenario lays out the math more 197 | explicitly.
198 | """ 199 | with log_timer_debug("normalize row weights for coinstall dict", self.logger): 200 | tmp_dict = self._normalize_row_weights(input_coinstall_dict) 201 | 202 | with log_timer_debug( 203 | f"normalizing output_dict of size: {len(tmp_dict)}", self.logger 204 | ): 205 | output_dict = {} 206 | for output_guid, output_guid_weight in tmp_dict.items(): 207 | guid_row_norm_list = self._cache.guid_maps_row_norm( 208 | output_guid, [] 209 | ) 210 | if len(guid_row_norm_list) == 0: 211 | self.logger.warning( 212 | "Can't find GUID_ROW_NORM data for [{}]".format(output_guid) 213 | ) 214 | continue 215 | norm_sum = sum(guid_row_norm_list) 216 | if norm_sum == 0: 217 | self.logger.warning( 218 | "Sum of GUID_ROW_NORM data for [{}] is zero.".format( 219 | output_guid 220 | ) 221 | ) 222 | continue 223 | output_dict[output_guid] = output_guid_weight / norm_sum 224 | 225 | return output_dict 226 | 227 | def norm_guidception(self, key_guid, input_coinstall_dict): 228 | tmp_dict = self._normalize_row_weights(input_coinstall_dict) 229 | 230 | return self._compute_recursive_results(tmp_dict, self.RECURSION_LEVELS) 231 | 232 | def _normalize_row_weights(self, coinstall_dict): 233 | # Compute an intermediary dictionary that is a row normalized 234 | # co-install. That is - each coinstalled guid weight is 235 | # divided by the sum of the weights for all coinstalled guids 236 | # on this row. 237 | tmp_dict = {} 238 | coinstall_total_weight = sum(coinstall_dict.values()) 239 | for coinstall_guid, coinstall_weight in coinstall_dict.items(): 240 | tmp_dict[coinstall_guid] = coinstall_weight / coinstall_total_weight 241 | return tmp_dict 242 | 243 | def _recursion_penalty(self, level): 244 | """ Return a factor to apply to the weight for a guid 245 | recommendation. 
246 | """ 247 | dampener = 1.0 - (1.0 * (self.RECURSION_LEVELS - level) / self.RECURSION_LEVELS) 248 | dampener *= dampener 249 | return dampener 250 | 251 | def _compute_recursive_results(self, row_normalized_coinstall, level): 252 | if level <= 0: 253 | return row_normalized_coinstall 254 | 255 | # consolidated_coinstall_dict will capture values 256 | consolidated_coinstall_dict = {} 257 | 258 | # Add this level's guid weight to the consolidated result 259 | dampener = self._recursion_penalty(level) 260 | for ( 261 | recommendation_guid, 262 | recommendation_guid_weight, 263 | ) in row_normalized_coinstall.items(): 264 | for guid, guid_weight in row_normalized_coinstall.items(): 265 | weight = consolidated_coinstall_dict.get(guid, 0) 266 | weight += dampener * guid_weight 267 | consolidated_coinstall_dict[guid] = weight 268 | 269 | # Add in the next level 270 | level -= 1 271 | for guid in consolidated_coinstall_dict.keys(): 272 | next_level_coinstalls = self._cache.get_coinstalls(guid, {}) 273 | if next_level_coinstalls != {}: 274 | # Normalize the next bunch of suggestions 275 | next_level_coinstalls = self._normalize_row_weights( 276 | next_level_coinstalls 277 | ) 278 | 279 | next_level_results = self._compute_recursive_results( 280 | next_level_coinstalls, level 281 | ) 282 | for (next_level_guid, next_level_weight,) in next_level_results.items(): 283 | weight = consolidated_coinstall_dict.get(guid, 0) 284 | weight += next_level_weight 285 | consolidated_coinstall_dict[guid] = weight 286 | 287 | # normalize the final results 288 | return self._normalize_row_weights(consolidated_coinstall_dict) 289 | -------------------------------------------------------------------------------- /taar/recommenders/locale_recommender.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | 6 | from taar.interfaces import IMozLogging, ITAARCache 7 | 8 | from .base_recommender import AbstractRecommender 9 | 10 | 11 | class LocaleRecommender(AbstractRecommender): 12 | """ A recommender class that returns top N addons based on the client geo-locale. 13 | 14 | This will load a json file containing updated top n addons in use per geo locale 15 | updated periodically by a separate process on airflow using Longitdudinal Telemetry 16 | data. 17 | 18 | This recommender may provide useful recommendations when collaborative_recommender 19 | may not work. 20 | """ 21 | 22 | def __init__(self, ctx): 23 | self._ctx = ctx 24 | 25 | self.logger = self._ctx[IMozLogging].get_logger("taar") 26 | 27 | self._cache = self._ctx[ITAARCache] 28 | 29 | def _get_cache(self, extra_data): 30 | tmp = extra_data.get("cache", None) 31 | if tmp is None: 32 | tmp = self._cache.cache_context() 33 | return tmp 34 | 35 | def can_recommend(self, client_data, extra_data={}): 36 | cache = self._get_cache(extra_data) 37 | 38 | # We can't recommend if we don't have our data files. 39 | if cache["top_addons_per_locale"] is None: 40 | return False 41 | 42 | # If we have data coming from other sources, we can use that for 43 | # recommending. 
        client_locale = client_data.get("locale", None) or extra_data.get(
            "locale", None
        )
        if not isinstance(client_locale, str):
            return False

        if client_locale not in cache["top_addons_per_locale"]:
            return False

        if not cache["top_addons_per_locale"].get(client_locale):
            return False

        return True

    def recommend(self, client_data, limit, extra_data={}):
        cache = self._get_cache(extra_data)
        # If we have data coming from multiple sources, prefer the one
        # from 'client_data'.
        client_locale = client_data.get("locale") or extra_data.get("locale", None)
        result_list = cache["top_addons_per_locale"].get(client_locale, [])[:limit]

        if "locale" not in client_data:
            try:
                client_data["locale"] = extra_data["locale"]
            except KeyError:
                client_data["locale"] = None

        log_data = (client_data["locale"], str([r[0] for r in result_list]))
        self.logger.debug(
            "locale_recommender_triggered, "
            "client_locale: [%s], guids: [%s]" % log_data
        )
        return result_list
-------------------------------------------------------------------------------- /taar/recommenders/randomizer.py: --------------------------------------------------------------------------------
"""
This module re-orders the (GUID, weight) 2-tuples using
numpy.random.choice
"""

import numpy as np


def in_experiment(client_id, xp_prob=0.5):
    """
    Return whether or not this client_id is in the experiment.

    xp_prob is a probability between 0.0 and 1.0 which is the
    chance that the experimental branch is selected.
    """
    hex_client = "".join([c for c in client_id.lower() if c in "abcdef0123456789"])
    int_client = int(hex_client, 16)
    return int((int_client % 100) < (xp_prob * 100))


def reorder_guids(guid_weight_tuples, size=None):
    """
    This reorders (GUID, weight) 2-tuples based on the weight using
    random selection, without replacement.

    @size denotes the length of the output.
    """
    if guid_weight_tuples is None or len(guid_weight_tuples) == 0:
        return []

    weights = np.array([weight for (guid, weight) in guid_weight_tuples])
    guids = [guid for (guid, weight) in guid_weight_tuples]
    guid_map = dict(zip(guids, guid_weight_tuples))

    if size is None:
        size = len(guids)
    else:
        size = min(size, len(guids))

    # Normalize the weights so that they're probabilities
    # Scale first, weights can be negative (for example, collaborative filtering similarity scores)
    scaled_weights = weights - np.min(weights) + np.finfo(float).eps
    probabilities = scaled_weights / np.sum(scaled_weights)

    choices = np.random.choice(guids, size=size, replace=False, p=probabilities)
    return [guid_map[guid] for guid in choices]
-------------------------------------------------------------------------------- /taar/recommenders/recommendation_manager.py: --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import markus

from taar.interfaces import IMozLogging, ITAARCache
from taar.recommenders.debug import log_timer_debug
from taar.recommenders.ensemble_recommender import (
    EnsembleRecommender,
    is_test_client,
)
from taar.recommenders.randomizer import reorder_guids

metrics = markus.get_metrics("taar")


class RecommenderFactory:
    """
    A RecommenderFactory provides support to create recommenders.

    The existence of a factory enables injection of dependencies into
    the RecommendationManager and eases the implementation of test
    harnesses.
    """

    def __init__(self, ctx):
        self._ctx = ctx
        # This map is set in the default context
        self._recommender_factory_map = self._ctx["recommender_factory_map"]

    def get_names(self):
        return self._recommender_factory_map.keys()

    def create(self, recommender_name):
        return self._recommender_factory_map[recommender_name]()


class RecommendationManager:
    """This class determines which of the set of recommendation
    engines will actually be used to generate recommendations."""

    def __init__(self, ctx):
        """Initialize the user profile fetcher and the recommenders.
        """
        self._ctx = ctx
        self.logger = self._ctx[IMozLogging].get_logger("taar") if self._ctx[IMozLogging] else None

        assert "profile_fetcher" in self._ctx

        self.profile_fetcher = ctx["profile_fetcher"]

        self._ensemble_recommender = EnsembleRecommender(self._ctx.child())

        # The whitelist data is only used for test client IDs

        self._cache = self._ctx[ITAARCache]

    @metrics.timer_decorator("profile_recommendation")
    def recommend(self, client_id, limit, extra_data={}):
        """Return recommendations for the given client.

        Recommendations are generated by the ensemble recommender
        using the cached model data.

        :param client_id: the client unique id.
        :param limit: the maximum number of recommendations to return.
        :param extra_data: a dictionary with extra client data.
        """

        with log_timer_debug("recommend executed", self.logger):
            # Read everything from redis now
            with log_timer_debug("redis read", self.logger):
                extra_data["cache"] = self._cache.cache_context()

            if is_test_client(client_id):
                # Just create a stub client_info blob
                client_info = {
                    "client_id": client_id,
                }
            else:
                with log_timer_debug("bigtable fetched data", self.logger):
                    client_info = self.profile_fetcher.get(client_id)

                if client_info is None:
                    self.logger.warning(
                        "Defaulting to empty results. No client info fetched from storage backend."
                    )
                    return []

            # Fetch back all possible whitelisted addons for this
            # client
            extra_data["guid_randomization"] = True
            whitelist = extra_data["cache"]["whitelist"]
            results = self._ensemble_recommender.recommend(
                client_info, len(whitelist), extra_data
            )

            results = reorder_guids(results, limit)

            self.logger.info(
                "Client recommendation results",
                extra={'client_id': client_id, 'recs': [r[0] for r in results]}
            )

            return results
-------------------------------------------------------------------------------- /taar/recommenders/redis_cache.py: --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import json
import os
import threading
import redis

from taar.recommenders.cache import TAARCache, RANKING_PREFIX, COINSTALL_PREFIX


# This marks which of the redis databases is currently
# active for read
ACTIVE_DB = "active_db"

# This is a mutex key used to block multiple concurrent writers
UPDATE_CHECK = "update_mutex|"


class PrefixStripper:
    def __init__(self, prefix, iterator, cast_to_str=False):
        self._prefix = prefix
        self._iter = iterator
        self._cast_to_str = cast_to_str

    def __iter__(self):
        return self

    def __next__(self):
        result = self._iter.__next__()
        result = result[len(self._prefix):]
        if self._cast_to_str:
            result = str(result)
        return result


class TAARCacheRedis(TAARCache):
    """
    This class manages a redis instance to hold onto the taar-lite
    GUID->GUID co-installation data
    """

    _instance = None

    @classmethod
    def get_instance(cls, ctx):
        if cls._instance is None:
            cls._instance = TAARCacheRedis(ctx, i_didnt_read_the_docs=False)
        return cls._instance

    def __init__(self, ctx, i_didnt_read_the_docs=True):
        super(TAARCacheRedis, self).__init__(ctx)

        if i_didnt_read_the_docs:
            raise RuntimeError(
                "You cannot call this method directly - use get_instance"
            )

        # Keep an integer handle (or None) on the last known database
        self._last_db = None

        rcon = self.init_redis_connections()

        self._r0 = rcon[0]
        self._r1 = rcon[1]
        self._r2 = rcon[2]

    def reset(self):
        # Clear out the r0 bookkeeping to reset the database
        return self._r0.flushdb()

    def info(self):
        """
        Dump bookkeeping metadata to logs
        """
        meta = {}
        for key in self._r0.scan_iter():
            meta[key.decode("utf8")] = self._r0.get(key).decode("utf8")
        if len(meta) == 0:
            self.logger.info("Bookkeeping data for TAARLite cache was empty")
        else:
            self.logger.info("TAARLite cache info", extra=meta)

    def init_redis_connections(self):
        """
        Bind connections to redis databases.
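        db 0 holds bookkeeping keys (ACTIVE_DB, UPDATE_CHECK) while
        dbs 1 and 2 hold the two interchangeable copies of model data.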
        This sits in its own method to enable mocking for tests.
        """
        return {
            0: redis.Redis(host=self._settings.REDIS_HOST, port=self._settings.REDIS_PORT, db=0),
            1: redis.Redis(host=self._settings.REDIS_HOST, port=self._settings.REDIS_PORT, db=1),
            2: redis.Redis(host=self._settings.REDIS_HOST, port=self._settings.REDIS_PORT, db=2),
        }

    def safe_load_data(self):
        """
        This is a multiprocess, multithread safe method to safely load
        data into the cache.

        If concurrent calls to this method are invoked, only the first
        call will have any effect.
        """
        # Pin the first thread ID to try to update data.
        # Note the nx flag: the key is only set if UPDATE_CHECK is not
        # already set.
        #
        # The thread barrier will auto-expire after TAARLITE_MUTEX_TTL
        # seconds in the event of process termination inside the
        # critical section.
        self._r0.set(UPDATE_CHECK, self._ident, nx=True, ex=self._settings.TAARLITE_MUTEX_TTL)
        self.logger.info(f"UPDATE_CHECK field is set: {self._ident}")

        # This is a concurrency barrier to make sure only the pinned
        # thread can update redis
        update_ident = self._r0.get(UPDATE_CHECK).decode("utf8")
        if update_ident != self._ident:
            self.logger.info(
                "Cache update lock has already been acquired by another process"
            )
            return

        # We're past the thread barrier - load the data and clear the
        # barrier when done
        try:
            self._load_data()
        finally:
            self._r0.delete(UPDATE_CHECK)
            self.logger.info("UPDATE_CHECK field is cleared")

    def _db_get(self, key, default=None, db=None):
        tmp = (db or self._db()).get(key)
        if tmp:
            return json.loads(tmp.decode("utf8"))
        return default

    def _db_set(self, key, val, db):
        db.set(key, json.dumps(val))

    def key_iter_ranking(self):
        return PrefixStripper(
            RANKING_PREFIX, self._db().scan_iter(match=RANKING_PREFIX + "*")
        )

    def key_iter_coinstall(self):
        return PrefixStripper(
            COINSTALL_PREFIX, self._db().scan_iter(match=COINSTALL_PREFIX + "*")
        )

    def is_active(self):
        """
        return True if data is loaded
        """
        # Any value in ACTIVE_DB indicates that data is live
        return self._r0.get(ACTIVE_DB) is not None

    def ensure_db_loaded(self):
        _ = self._db()  # make sure we've computed data from the live redis instance

    def _db(self):
        """
        This dereferences the ACTIVE_DB pointer to get the current
        active redis instance
        """
        active_db = self._r0.get(ACTIVE_DB)

        if active_db is not None:
            db = int(active_db.decode("utf8"))

            if db == 1:
                live_db = self._r1
            elif db == 2:
                live_db = self._r2

            # Run all callback functions to preprocess model data
            self._update_data_callback(db, live_db)

            return live_db

    def _update_data_callback(self, db_num, db):
        """
        Preprocess data when the current redis instance does not match
        the last known instance.
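        After _load_data() flips the ACTIVE_DB pointer, the next read
        through _db() lands here with a new db_num, and the in-memory
        cache context is rebuilt (via _build_cache_context) from the
        freshly loaded instance.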
184 | """ 185 | if db_num == self._last_db: 186 | return 187 | 188 | self._last_db = db_num 189 | 190 | self._build_cache_context(db) 191 | self.logger.info("Completed precomputing normalized data") 192 | 193 | @property 194 | def _ident(self): 195 | """ pid/thread identity """ 196 | return f"{os.getpid()}_{threading.get_ident()}" 197 | 198 | def _load_data(self): 199 | active_db = self._r0.get(ACTIVE_DB) 200 | if active_db is not None: 201 | active_db = int(active_db.decode("utf8")) 202 | if active_db == 1: 203 | next_active_db = 2 204 | else: 205 | next_active_db = 1 206 | else: 207 | next_active_db = 1 208 | 209 | if next_active_db == 1: 210 | db = self._r1 211 | else: 212 | db = self._r2 213 | 214 | # Clear this database before we do anything with it 215 | db.flushdb() 216 | 217 | self._copy_data(db) 218 | 219 | self._r0.set(ACTIVE_DB, next_active_db) 220 | self.logger.info(f"Active DB is set to {next_active_db}") 221 | -------------------------------------------------------------------------------- /taar/recommenders/similarity_recommender.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from taar.recommenders.base_recommender import AbstractRecommender 6 | from itertools import groupby 7 | from scipy.spatial import distance 8 | from taar.interfaces import IMozLogging, ITAARCache 9 | import numpy as np 10 | 11 | FLOOR_DISTANCE_ADJUSTMENT = 0.001 12 | 13 | CATEGORICAL_FEATURES = ["geo_city", "locale", "os"] 14 | CONTINUOUS_FEATURES = [ 15 | "subsession_length", 16 | "bookmark_count", 17 | "tab_open_count", 18 | "total_uri", 19 | "unique_tlds", 20 | ] 21 | 22 | 23 | class SimilarityRecommender(AbstractRecommender): 24 | """ A recommender class that returns top N addons based on the 25 | client similarity with a set of candidate addon donors. 26 | 27 | Several telemetry fields are used to compute pairwise similarity 28 | with the donors and similarities are converted into a likelihood 29 | ratio of being a good match versus not being a good match. These 30 | quantities are then used to rank specific addons for 31 | recommendation. 32 | 33 | This will load a json file containing updated list of addon donors 34 | updated periodically by a separate weekly process using 35 | Longitdudinal Telemetry data. 36 | 37 | This recommender may provide useful recommendations when 38 | collaborative_recommender may not work. 39 | """ 40 | 41 | def __init__(self, ctx): 42 | self._ctx = ctx 43 | 44 | self._cache = self._ctx[ITAARCache] 45 | 46 | self.logger = self._ctx[IMozLogging].get_logger("taar") 47 | 48 | def _get_cache(self, extra_data): 49 | tmp = extra_data.get("cache", None) 50 | if tmp is None: 51 | tmp = self._cache.cache_context() 52 | return tmp 53 | 54 | """ 55 | End private properties 56 | """ 57 | 58 | def can_recommend(self, client_data, extra_data={}): 59 | cache = self._get_cache(extra_data) 60 | # We can't recommend if we don't have our data files. 61 | if cache["donors_pool"] is None or cache["lr_curves"] is None: 62 | return False 63 | 64 | # Check that the client info contains a non-None value for each required 65 | # telemetry field. 
        REQUIRED_FIELDS = CATEGORICAL_FEATURES + CONTINUOUS_FEATURES

        has_fields = all(
            [client_data.get(f, None) is not None for f in REQUIRED_FIELDS]
        )
        if not has_fields:
            # Cannot add extra info because client_id may not be available.
            self.logger.error("Unusable client data encountered")
        return has_fields

    def get_lr(self, score, cache):
        """Compute a :float: likelihood ratio from a provided similarity score when compared
        to two probability density functions which are computed and pre-loaded during init.

        The numerator indicates the probability density that a particular similarity score
        corresponds to a 'good' addon donor, i.e. a client that is similar in the sense of
        telemetry variables. The denominator indicates the probability density that a particular
        similarity score corresponds to a 'poor' addon donor.

        :param score: A similarity score between a pair of objects.
        :returns: The approximate float likelihood ratio corresponding to provided score.
        """
        # Find the index of the closest value that was precomputed in lr_curves
        # This will significantly speed up |get_lr|.

        # The lr_curves_cache is a list of scalar distance
        # measurements
        lr_curves_cache = np.array([s[0] for s in cache["lr_curves"]])

        # np.argmin produces the index of the curve point whose
        # distance to the score under inspection is smallest.
        idx = np.argmin(abs(score - lr_curves_cache))

        numer_val = cache["lr_curves"][idx][1][0]
        denom_val = cache["lr_curves"][idx][1][1]

        # Compute LR based on numerator and denominator values
        return float(numer_val) / float(denom_val)

    # # # CAUTION! # # #
    # Any changes to this function must be reflected in the corresponding ETL job.
    # https://github.com/mozilla/python_mozetl/blob/master/mozetl/taar/taar_similarity.py
    #
    def compute_clients_dist(self, client_data, cache):
        client_categorical_feats = [
            client_data.get(specified_key) for specified_key in CATEGORICAL_FEATURES
        ]
        client_continuous_feats = [
            client_data.get(specified_key) for specified_key in CONTINUOUS_FEATURES
        ]

        # Compute the distances between the user and the cached continuous features.
        cont_features = distance.cdist(
            cache["continuous_features"],
            np.array([client_continuous_feats]),
            "canberra",
        )

        # Compute the distances between the user and the cached categorical features.
        cat_features = np.array(
            [
                [distance.hamming(x, client_categorical_feats)]
                for x in cache["categorical_features"]
            ]
        )

        # See the "Note about cdist optimization" in README.md for why we only use cdist once.

        # Take the product of similarities to attain a univariate similarity score.
        # Note that the addition of 0.001 to the continuous features
        # sets a floor value to the distance in continuous similarity
        # scores. There is no such floor value set for categorical
        # features, so this adjustment prioritizes categorical
        # similarity over continuous similarity.
        return (cont_features + FLOOR_DISTANCE_ADJUSTMENT) * cat_features

    def get_similar_donors(self, client_data, cache):
        """Computes a set of :float: similarity scores between a client and a set of candidate
        donors for which comparable variables have been measured.

        A custom similarity metric is defined in this function that combines the Hamming distance
        for categorical variables with the Canberra distance for continuous variables into a
        univariate similarity metric between the client and a set of candidate donors loaded during
        init.

        :param client_data: a client data payload including a subset of telemetry fields.
        :return: the sorted approximate likelihood ratio (np.array) corresponding to the
                 internally computed similarity score and a list of indices that link
                 each LR score with the related donor in the |self.donors_pool|.
        """
        # Compute the distance between self and any comparable client.
        distances = self.compute_clients_dist(client_data, cache)

        # Compute the LR based on precomputed distributions that relate the score
        # to a probability of providing good addon recommendations.

        lrs_from_scores = np.array(
            [self.get_lr(distances[i], cache) for i in range(cache["num_donors"])]
        )

        # Sort the LR values (descending) and return the sorted values together with
        # the original indices.
        indices = (-lrs_from_scores).argsort()
        return lrs_from_scores[indices], indices

    def _recommend(self, client_data, limit, extra_data={}):
        cache = self._get_cache(extra_data)

        donor_set_ranking, indices = self.get_similar_donors(client_data, cache)
        donor_log_lrs = np.log(donor_set_ranking)
        # An LR of 1.0 corresponds to a log-likelihood ratio of 0, meaning a donor
        # is equally likely to be 'good' or 'poor'. Any log-LR > 0.0 is sufficient,
        # but we prefer it to be comfortably high.
        if donor_log_lrs[0] < 0.1:
            self.logger.warning(
                "Addons recommended with very low similarity score, perhaps donor set is unrepresentative",
                extra={"maximum_similarity": donor_set_ranking[0]},
            )

        # Retrieve the indices of the highest ranked donors and then append their
        # installed addons.
        index_lrs_iter = zip(indices[donor_log_lrs > 0.0], donor_log_lrs)
        recommendations = []
        for (index, lrs) in index_lrs_iter:
            for term in cache["donors_pool"][index]["active_addons"]:
                candidate = (term, lrs)
                recommendations.append(candidate)
        # Sort recommendations on key (guid name)
        recommendations = sorted(recommendations, key=lambda x: x[0])
        recommendations_out = []
        # recommendations must be sorted for this to work.
        for guid_key, group in groupby(recommendations, key=lambda x: x[0]):
            recommendations_out.append((guid_key, sum(j for i, j in group)))
        # now re-sort on the basis of LLR.
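        # (Illustrative: a GUID donated twice with log-LRs 0.7 and 0.3
        # surfaces once with a combined weight of 1.0, ranking ahead of
        # a GUID donated once with log-LR 0.8.)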
        recommendations_out = sorted(recommendations_out, key=lambda x: -x[1])

        log_data = (
            client_data["client_id"],
            str([r[0] for r in recommendations_out[:limit]]),
        )
        self.logger.debug(
            "similarity_recommender_triggered, "
            "client_id: [%s], guids: [%s]" % log_data
        )
        return recommendations_out

    def recommend(self, client_data, limit, extra_data={}):
        recommendations_out = self._recommend(client_data, limit, extra_data)
        return recommendations_out[:limit]
-------------------------------------------------------------------------------- /taar/recommenders/ua_parser.py: --------------------------------------------------------------------------------
import re

RE_PLATFORM = re.compile("(linux|windows|macintosh|android|fxios).*firefox")

LINUX = 1
WINDOWS = 2
MACINTOSH = 3
ANDROID = 4
FXIOS = 5

OSNAME_TO_ID = {
    "linux": LINUX,
    "windows": WINDOWS,
    "macintosh": MACINTOSH,
    "android": ANDROID,
    "fxios": FXIOS,
}


def parse_ua(user_agent):
    """
    Return one of the constants for platform selection, otherwise
    return None if the platform cannot be determined. Any non-Firefox
    agent is automatically short-circuited to None.
    """
    ua = user_agent.lower()
    matches = RE_PLATFORM.findall(ua)
    if len(matches) != 1:
        return None
    return OSNAME_TO_ID[matches[0]]
-------------------------------------------------------------------------------- /taar/settings.py: --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
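
# All settings below are read through python-decouple's config(), so any
# value can be overridden with an environment variable of the same name.
# An illustrative override for local development:
#
#   export TAAR_MAX_RESULTS=20
#   export REDIS_HOST=redis.example.internal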

from decouple import config


class AppSettings:
    PYTHON_LOG_LEVEL = config("PYTHON_LOG_LEVEL", "INFO")
    STATSD_HOST = config("STATSD_HOST", default="localhost", cast=str)
    STATSD_PORT = config("STATSD_PORT", default=8125, cast=int)

    DISABLE_REDIS = config("DISABLE_REDIS", False, cast=bool)

    TAAR_MAX_RESULTS = config("TAAR_MAX_RESULTS", default=10, cast=int)
    TAARLITE_MAX_RESULTS = config("TAARLITE_MAX_RESULTS", default=4, cast=int)

    # Bigtable config
    BIGTABLE_PROJECT_ID = config(
        "BIGTABLE_PROJECT_ID", default="cfr-personalization-experiment"
    )
    BIGTABLE_INSTANCE_ID = config("BIGTABLE_INSTANCE_ID", default="taar-profile")
    BIGTABLE_TABLE_ID = config("BIGTABLE_TABLE_ID", default="taar_profile")


class DefaultCacheSettings:
    DISABLE_TAAR_LITE = config("DISABLE_TAAR_LITE", False, cast=bool)
    DISABLE_ENSEMBLE = config("DISABLE_ENSEMBLE", False, cast=bool)

    TAAR_ENSEMBLE_BUCKET = config("TAAR_ENSEMBLE_BUCKET", default="test_ensemble_bucket")
    TAAR_ENSEMBLE_KEY = config("TAAR_ENSEMBLE_KEY", default="test_ensemble_key")

    TAAR_WHITELIST_BUCKET = config("TAAR_WHITELIST_BUCKET", default="test_whitelist_bucket")
    TAAR_WHITELIST_KEY = config("TAAR_WHITELIST_KEY", default="test_whitelist_key")

    TAAR_ITEM_MATRIX_BUCKET = config("TAAR_ITEM_MATRIX_BUCKET", default="test_matrix_bucket")
    TAAR_ITEM_MATRIX_KEY = config("TAAR_ITEM_MATRIX_KEY", default="test_matrix_key")
    TAAR_ADDON_MAPPING_BUCKET = config("TAAR_ADDON_MAPPING_BUCKET", default="test_mapping_bucket")
    TAAR_ADDON_MAPPING_KEY = config("TAAR_ADDON_MAPPING_KEY", default="test_mapping_key")

    TAAR_LOCALE_BUCKET = config("TAAR_LOCALE_BUCKET", default="test_locale_bucket")
    TAAR_LOCALE_KEY = config("TAAR_LOCALE_KEY", default="test_locale_key")

    TAAR_SIMILARITY_BUCKET = config("TAAR_SIMILARITY_BUCKET", default="test_similarity_bucket")
    TAAR_SIMILARITY_DONOR_KEY = config("TAAR_SIMILARITY_DONOR_KEY", default="test_similarity_donor_key")
    TAAR_SIMILARITY_LRCURVES_KEY = config("TAAR_SIMILARITY_LRCURVES_KEY", default="test_similarity_lrcurves_key")

    # TAAR-lite configuration below

    TAARLITE_GUID_COINSTALL_BUCKET = config("TAARLITE_GUID_COINSTALL_BUCKET", "telemetry-parquet")
    TAARLITE_GUID_COINSTALL_KEY = config("TAARLITE_GUID_COINSTALL_KEY", "taar/lite/guid_coinstallation.json")

    TAARLITE_GUID_RANKING_KEY = config("TAARLITE_GUID_RANKING_KEY", "taar/lite/guid_install_ranking.json")

    TAARLITE_TRUNCATE = config("TAARLITE_TRUNCATE", AppSettings.TAARLITE_MAX_RESULTS * 5, cast=int)


class RedisCacheSettings(DefaultCacheSettings):
    # 4 hour liveliness (TTL) for TAARLITE data
    TAARLITE_TTL = config("TAARLITE_TTL", 60 * 60 * 4, cast=int)

    # TAARlite needs redis-backed mutexes to protect critical sections.
    # Set a default TAARLite mutex TTL of 1 hour to fully populate the
    # redis cache.
    TAARLITE_MUTEX_TTL = config("TAARLITE_MUTEX_TTL", 60 * 60, cast=int)

    REDIS_HOST = config("REDIS_HOST", "localhost", cast=str)
    REDIS_PORT = config("REDIS_PORT", 6379, cast=int)


class PackageCacheSettings(DefaultCacheSettings):
    TAAR_LOCALE_BUCKET = 'moz-fx-data-taar-pr-prod-e0f7-prod-models'
    TAAR_LOCALE_KEY = 'taar/locale/top10_dict.json.bz2'

    TAAR_SIMILARITY_BUCKET = 'moz-fx-data-taar-pr-prod-e0f7-prod-models'
    TAAR_SIMILARITY_DONOR_KEY = 'taar/similarity/donors.json.bz2'
78 | TAAR_SIMILARITY_LRCURVES_KEY = 'taar/similarity/lr_curves.json.bz2' 79 | 80 | TAAR_ITEM_MATRIX_BUCKET = 'moz-fx-data-taar-pr-prod-e0f7-prod-models' 81 | TAAR_ITEM_MATRIX_KEY = 'addon_recommender/item_matrix.json.bz2' 82 | TAAR_ADDON_MAPPING_BUCKET = 'moz-fx-data-taar-pr-prod-e0f7-prod-models' 83 | TAAR_ADDON_MAPPING_KEY = 'addon_recommender/addon_mapping.json.bz2' 84 | 85 | DISABLE_TAAR_LITE = True 86 | DISABLE_ENSEMBLE = True 87 | -------------------------------------------------------------------------------- /taar/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # this page intentionally left blank 2 | -------------------------------------------------------------------------------- /taar/utils.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import hashlib 6 | 7 | 8 | def hasher(client_id): 9 | return hashlib.new("sha256", client_id.encode("utf8")).hexdigest() 10 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/taar/f542a1ec1ea50812c81a9782922447adc0a5bfab/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | These are global fixtures automagically loaded by pytest 3 | """ 4 | 5 | import pytest 6 | from taar.context import app_context 7 | 8 | FAKE_LOCALE_DATA = { 9 | "te-ST": [ 10 | "{1e6b8bce-7dc8-481c-9f19-123e41332b72}", 11 | "some-other@nice-addon.com", 12 | "{66d1eed2-a390-47cd-8215-016e9fa9cc55}", 13 | "{5f1594c3-0d4c-49dd-9182-4fbbb25131a7}", 14 | ], 15 | "en": ["some-uuid@test-addon.com", "other-addon@some-id.it"], 16 | } 17 | 18 | 19 | @pytest.fixture(scope='function') 20 | def test_ctx(): 21 | ctx = app_context() 22 | return ctx 23 | 24 | 25 | @pytest.fixture 26 | def TAARLITE_MOCK_DATA(): 27 | return { 28 | "guid-1": { 29 | "guid-2": 1000, 30 | "guid-3": 100, 31 | "guid-4": 10, 32 | "guid-5": 1, 33 | "guid-6": 1, 34 | }, 35 | "guid-2": { 36 | "guid-1": 50, 37 | "guid-3": 40, 38 | "guid-4": 20, 39 | "guid-8": 30, 40 | "guid-9": 10, 41 | }, 42 | "guid-3": {"guid-1": 100, "guid-2": 40, "guid-4": 70}, 43 | "guid-4": {"guid-2": 20}, 44 | "guid-6": {"guid-1": 5, "guid-7": 100, "guid-8": 100, "guid-9": 100}, 45 | "guid-8": {"guid-2": 30}, 46 | "guid-9": {"guid-2": 10}, 47 | } 48 | 49 | 50 | @pytest.fixture 51 | def TAARLITE_TIE_MOCK_DATA(): 52 | return { 53 | "guid-1": {"guid-2": 100, "guid-3": 100, "guid-4": 100, "guid-5": 100}, 54 | "guid-2": {"guid-1": 100, "guid-3": 100, "guid-4": 100, "guid-5": 100}, 55 | "guid-3": {"guid-1": 100, "guid-2": 100, "guid-4": 100, "guid-5": 100}, 56 | "guid-4": {"guid-1": 20, "guid-2": 20, "guid-3": 20, "guid-5": 20}, 57 | "guid-5": {"guid-1": 20, "guid-2": 20, "guid-3": 20, "guid-4": 20}, 58 | } 59 | 60 | 61 | @pytest.fixture 62 | def TAARLITE_MOCK_GUID_RANKING(): 63 | return { 64 | "guid-1": 10, 65 | "guid-2": 9, 66 | "guid-3": 8, 67 | "guid-4": 7, 68 | "guid-5": 6, 69 | "guid-6": 5, 70 | "guid-7": 4, 71 | "guid-8": 3, 72 | "guid-9": 2, 73 | } 74 | 75 | 76 | @pytest.fixture 77 | def 
TAARLITE_CUTOFF_GUID_RANKING():
    return {
        "guid-1": 10000,
        "guid-2": 9000,
        "guid-3": 8000,
        "guid-4": 7,
        "guid-5": 6000,
        "guid-6": 5000,
        "guid-7": 4000,
        "guid-8": 3000,
        "guid-9": 2000,
    }
-------------------------------------------------------------------------------- /tests/mocks.py: --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


class MockRecommender:
    """The MockRecommender takes in a map of GUID->weight."""

    def __init__(self, guid_map):
        self._guid_map = guid_map

    def can_recommend(self, *args, **kwargs):
        return True

    def recommend(self, *args, **kwargs):
        return sorted(self._guid_map.items(), key=lambda item: -item[1])


class MockRecommenderFactory:
    """
    A RecommenderFactory provides support to create recommenders.

    The existence of a factory enables injection of dependencies into
    the RecommendationManager and eases the implementation of test
    harnesses.
    """

    def __init__(self, **kwargs):
        mock_legacy = MockRecommender({"abc": 1.0, "bcd": 1.1, "cde": 1.2})
        mock_locale = MockRecommender({"def": 2.0, "efg": 2.1, "fgh": 2.2, "abc": 2.3})
        mock_collaborative = MockRecommender(
            {"ghi": 3.0, "hij": 3.1, "ijk": 3.2, "def": 3.3}
        )
        mock_similarity = MockRecommender(
            {"jkl": 4.0, "klm": 4.1, "lmn": 4.2, "ghi": 4.3}
        )

        self._recommender_factory_map = {
            "legacy": lambda: mock_legacy,
            "collaborative": lambda: mock_collaborative,
            "similarity": lambda: mock_similarity,
            "locale": lambda: mock_locale,
        }

        # Clobber any kwarg passed in recommenders
        for key in self._recommender_factory_map.keys():
            self._recommender_factory_map[key] = kwargs.get(
                key, self._recommender_factory_map[key]
            )

    def get_names(self):
        return self._recommender_factory_map.keys()

    def create(self, recommender_name):
        return self._recommender_factory_map[recommender_name]()


class MockProfileController:
    def __init__(self, mock_profile):
        self._profile = mock_profile

    def get_client_profile(self, client_id):
        return self._profile
-------------------------------------------------------------------------------- /tests/noop_fixtures.py: --------------------------------------------------------------------------------
"""

Noop helpers
"""

import mock
from taar.recommenders.redis_cache import TAARCacheRedis


def noop_taarlite_dataload(stack):
    # no-op the taarlite rank data
    stack.enter_context(
        mock.patch.object(TAARCacheRedis, "_update_rank_data", return_value=None)
    )
    # no-op the taarlite guid-guid data
    stack.enter_context(
        mock.patch.object(TAARCacheRedis, "_update_coinstall_data", return_value=None)
    )
    return stack


def noop_taarlocale_dataload(stack):
    # no-op the taar locale data
    stack.enter_context(
        mock.patch.object(TAARCacheRedis, "_update_locale_data", return_value=None)
    )
    return stack


def noop_taarcollab_dataload(stack):
    # no-op the taar collab data
    stack.enter_context(
        mock.patch.object(
"_update_collab_data", return_value=None) 34 | ) 35 | return stack 36 | 37 | 38 | def noop_taarsimilarity_dataload(stack): 39 | # no-op the taar collab 40 | stack.enter_context( 41 | mock.patch.object(TAARCacheRedis, "_update_similarity_data", return_value=None) 42 | ) 43 | return stack 44 | 45 | 46 | def noop_taarensemble_dataload(stack): 47 | # no-op the taar collab 48 | stack.enter_context( 49 | mock.patch.object(TAARCacheRedis, "_update_ensemble_data", return_value=None) 50 | ) 51 | stack.enter_context( 52 | mock.patch.object(TAARCacheRedis, "_update_whitelist_data", return_value=None) 53 | ) 54 | return stack 55 | -------------------------------------------------------------------------------- /tests/similarity_data.py: -------------------------------------------------------------------------------- 1 | # Each of these records is identical with respect to the 2 | # CATEGORICAL_FEATURES and CONTINUOUS_FEATURES of the TAAR client 3 | # generated by `generate_a_fake_taar_client` except for the 4 | # `total_uri` field. This gives us a deterministic way to order the 5 | # recommendations provided by the SimilarityRecommender based on 6 | # continuous feature similarity. 7 | 8 | CONTINUOUS_FEATURE_FIXTURE_DATA = [ 9 | { 10 | "active_addons": [ 11 | "{test-guid-1}", 12 | "{test-guid-2}", 13 | "{test-guid-3}", 14 | "{test-guid-4}", 15 | ], 16 | "geo_city": "brasilia-br", 17 | "subsession_length": 4911, 18 | "locale": "br-PT", 19 | "os": "mac", 20 | "bookmark_count": 7, 21 | "tab_open_count": 4, 22 | "total_uri": 190, 23 | "unique_tlds": 21, 24 | }, 25 | { 26 | "active_addons": [ 27 | "{test-guid-5}", 28 | "{test-guid-6}", 29 | "{test-guid-1}", 30 | "{test-guid-8}", 31 | ], 32 | "geo_city": "brasilia-br", 33 | "subsession_length": 4911, 34 | "locale": "br-PT", 35 | "os": "mac", 36 | "bookmark_count": 7, 37 | "tab_open_count": 4, 38 | "total_uri": 200, 39 | "unique_tlds": 21, 40 | }, 41 | { 42 | "active_addons": [ 43 | "{test-guid-9}", 44 | "{test-guid-10}", 45 | "{test-guid-11}", 46 | "{test-guid-12}", 47 | ], 48 | "geo_city": "brasilia-br", 49 | "subsession_length": 4911, 50 | "locale": "br-PT", 51 | "os": "mac", 52 | "bookmark_count": 7, 53 | "tab_open_count": 4, 54 | "total_uri": 222, 55 | "unique_tlds": 21, 56 | }, 57 | { 58 | "active_addons": ["{test-guid-13}", "{test-guid-14}"], 59 | "geo_city": "brasilia-br", 60 | "subsession_length": 4911, 61 | "locale": "br-PT", 62 | "os": "mac", 63 | "bookmark_count": 7, 64 | "tab_open_count": 4, 65 | "total_uri": 210, 66 | "unique_tlds": 21, 67 | }, 68 | ] 69 | 70 | # Match the fixture taar client, but vary the geo_city to test only 71 | # the categorical feature matching. 72 | 73 | # Additionally the second donor contains the only duplicate recommendation 74 | # of "{test-guid-1}" 75 | 76 | CATEGORICAL_FEATURE_FIXTURE_DATA = [ 77 | { 78 | "active_addons": [ 79 | "{test-guid-1}", 80 | "{test-guid-2}", 81 | "{test-guid-3}", 82 | "{test-guid-4}", 83 | ], 84 | "geo_city": "brasilia-br", 85 | "subsession_length": 4911, 86 | "locale": "br-PT", 87 | "os": "mac", 88 | "bookmark_count": 7, 89 | "tab_open_count": 4, 90 | "total_uri": 222, 91 | "unique_tlds": 21, 92 | }, 93 | { 94 | # "{test-guid-1}" appears in duplicate here. 
95 | "active_addons": [ 96 | "{test-guid-5}", 97 | "{test-guid-6}", 98 | "{test-guid-1}", 99 | "{test-guid-8}", 100 | ], 101 | "geo_city": "toronto-ca", 102 | "subsession_length": 4911, 103 | "locale": "br-PT", 104 | "os": "mac", 105 | "bookmark_count": 7, 106 | "tab_open_count": 4, 107 | "total_uri": 222, 108 | "unique_tlds": 21, 109 | }, 110 | { 111 | "active_addons": [ 112 | "{test-guid-9}", 113 | "{test-guid-10}", 114 | "{test-guid-11}", 115 | "{test-guid-12}", 116 | ], 117 | "geo_city": "brasilia-br", 118 | "subsession_length": 4911, 119 | "locale": "br-PT", 120 | "os": "mac", 121 | "bookmark_count": 7, 122 | "tab_open_count": 4, 123 | "total_uri": 222, 124 | "unique_tlds": 21, 125 | }, 126 | { 127 | "active_addons": ["{test-guid-13}", "{test-guid-1}"], 128 | "geo_city": "toronto-ca", 129 | "subsession_length": 4911, 130 | "locale": "br-PT", 131 | "os": "mac", 132 | "bookmark_count": 7, 133 | "tab_open_count": 4, 134 | "total_uri": 222, 135 | "unique_tlds": 21, 136 | }, 137 | ] 138 | -------------------------------------------------------------------------------- /tests/test_collaborativerecommender.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | """ 6 | Test cases for the TAAR CollaborativeRecommender 7 | """ 8 | 9 | import contextlib 10 | 11 | import fakeredis 12 | import mock 13 | import numpy 14 | 15 | from taar.interfaces import ITAARCache 16 | from taar.recommenders.collaborative_recommender import CollaborativeRecommender 17 | from taar.recommenders.collaborative_recommender import positive_hash 18 | from taar.recommenders.redis_cache import TAARCacheRedis 19 | from .noop_fixtures import ( 20 | noop_taarlocale_dataload, 21 | noop_taarlite_dataload, 22 | noop_taarensemble_dataload, 23 | noop_taarsimilarity_dataload, 24 | ) 25 | 26 | """ 27 | We need to generate a synthetic list of addons and relative weights 28 | for co-occurance. It's important to note that the 29 | CollaborativeRecommender model expects that addon IDs are hashed using 30 | the Java hash function. 31 | """ 32 | 33 | 34 | def noop_other_recommenders(stack): 35 | stack = noop_taarlocale_dataload(stack) 36 | stack = noop_taarlite_dataload(stack) 37 | stack = noop_taarsimilarity_dataload(stack) 38 | stack = noop_taarensemble_dataload(stack) 39 | return stack 40 | 41 | 42 | @contextlib.contextmanager 43 | def mock_install_none_mock_data(ctx): 44 | """ 45 | Overload the 'real' addon model and mapping URLs responses so that 46 | we always get 404 errors. 
47 | """ 48 | with contextlib.ExitStack() as stack: 49 | TAARCacheRedis._instance = None 50 | 51 | stack.enter_context( 52 | mock.patch.object( 53 | TAARCacheRedis, "_fetch_collaborative_item_matrix", return_value="", 54 | ) 55 | ) 56 | stack.enter_context( 57 | mock.patch.object( 58 | TAARCacheRedis, "_fetch_collaborative_mapping_data", return_value="", 59 | ) 60 | ) 61 | 62 | stack = noop_other_recommenders(stack) 63 | 64 | # Patch fakeredis in 65 | stack.enter_context( 66 | mock.patch.object( 67 | TAARCacheRedis, 68 | "init_redis_connections", 69 | return_value={ 70 | 0: fakeredis.FakeStrictRedis(db=0), 71 | 1: fakeredis.FakeStrictRedis(db=1), 72 | 2: fakeredis.FakeStrictRedis(db=2), 73 | }, 74 | ) 75 | ) 76 | 77 | # Initialize redis 78 | cache = TAARCacheRedis.get_instance(ctx) 79 | cache.safe_load_data() 80 | ctx[ITAARCache] = cache 81 | yield stack 82 | 83 | 84 | @contextlib.contextmanager 85 | def mock_install_mock_data(ctx): 86 | addon_space = [ 87 | {"id": "addon1.id", "name": "addon1.name", "isWebextension": True}, 88 | {"id": "addon2.id", "name": "addon2.name", "isWebextension": True}, 89 | {"id": "addon3.id", "name": "addon3.name", "isWebextension": True}, 90 | {"id": "addon4.id", "name": "addon4.name", "isWebextension": True}, 91 | {"id": "addon5.id", "name": "addon5.name", "isWebextension": True}, 92 | ] 93 | 94 | fake_addon_matrix = [] 95 | for i, addon in enumerate(addon_space): 96 | row = { 97 | "id": positive_hash(addon["id"]), 98 | "features": [0, 0.2, 0.0, 0.1, 0.15], 99 | } 100 | row["features"][i] = 1.0 101 | fake_addon_matrix.append(row) 102 | 103 | fake_mapping = {} 104 | for addon in addon_space: 105 | java_hash = positive_hash(addon["id"]) 106 | fake_mapping[str(java_hash)] = addon 107 | 108 | with contextlib.ExitStack() as stack: 109 | TAARCacheRedis._instance = None 110 | stack.enter_context( 111 | mock.patch.object( 112 | TAARCacheRedis, 113 | "_fetch_collaborative_item_matrix", 114 | return_value=fake_addon_matrix, 115 | ) 116 | ) 117 | stack.enter_context( 118 | mock.patch.object( 119 | TAARCacheRedis, 120 | "_fetch_collaborative_mapping_data", 121 | return_value=fake_mapping, 122 | ) 123 | ) 124 | 125 | stack = noop_other_recommenders(stack) 126 | 127 | # Patch fakeredis in 128 | stack.enter_context( 129 | mock.patch.object( 130 | TAARCacheRedis, 131 | "init_redis_connections", 132 | return_value={ 133 | 0: fakeredis.FakeStrictRedis(db=0), 134 | 1: fakeredis.FakeStrictRedis(db=1), 135 | 2: fakeredis.FakeStrictRedis(db=2), 136 | }, 137 | ) 138 | ) 139 | 140 | # Initialize redis 141 | cache = TAARCacheRedis.get_instance(ctx) 142 | cache.safe_load_data() 143 | ctx[ITAARCache] = cache 144 | yield stack 145 | 146 | 147 | def test_cant_recommend(test_ctx): 148 | with mock_install_mock_data(test_ctx): 149 | r = CollaborativeRecommender(test_ctx) 150 | 151 | # Test that we can't recommend if we have not enough client info. 152 | assert not r.can_recommend({}) 153 | assert not r.can_recommend({"installed_addons": []}) 154 | 155 | 156 | def test_can_recommend(test_ctx): 157 | with mock_install_mock_data(test_ctx): 158 | r = CollaborativeRecommender(test_ctx) 159 | 160 | # Check that we can recommend if the user has at least an addon. 
        assert r.can_recommend(
            {
                "installed_addons": ["uBlock0@raymondhill.net"],
                "client_id": "test-client",
            }
        )


def test_can_recommend_no_model(test_ctx):
    with mock_install_none_mock_data(test_ctx):
        r = CollaborativeRecommender(test_ctx)

        # We should never be able to recommend if something went wrong with the model.
        assert not r.can_recommend({})
        assert not r.can_recommend({"installed_addons": []})
        assert not r.can_recommend({"installed_addons": ["uBlock0@raymondhill.net"]})


def test_empty_recommendations(test_ctx):
    # Tests that the empty recommender always recommends an empty list
    # of addons if we have no addons
    with mock_install_none_mock_data(test_ctx):
        r = CollaborativeRecommender(test_ctx)
        assert not r.can_recommend({})

        # Note that calling recommend() if can_recommend has failed is not
        # defined.


def test_best_recommendation(test_ctx):
    # Make sure the structure of the recommendations is correct and that we
    # recommended the right addon.
    with mock_install_mock_data(test_ctx):
        r = CollaborativeRecommender(test_ctx)

        # A non-empty set of addons should give a list of recommendations
        fixture_client_data = {
            "installed_addons": ["addon4.id"],
            "client_id": "test_client",
        }
        assert r.can_recommend(fixture_client_data)
        recommendations = r.recommend(fixture_client_data, 1)

        assert isinstance(recommendations, list)
        assert len(recommendations) == 1

        # Verify that addon2 - the most heavily weighted addon - was
        # recommended
        result = recommendations[0]
        assert type(result) is tuple
        assert len(result) == 2
        assert result[0] == "addon2.id"
        assert type(result[1]) is numpy.float64
        assert numpy.isclose(result[1], numpy.float64("0.3225"))


def test_recommendation_weights(test_ctx):
    """
    Weights should be ordered greatest to lowest
    """
    with mock_install_mock_data(test_ctx):
        r = CollaborativeRecommender(test_ctx)

        # A non-empty set of addons should give a list of recommendations
        fixture_client_data = {
            "installed_addons": ["addon4.id"],
            "client_id": "test_client",
        }
        assert r.can_recommend(fixture_client_data)
        recommendations = r.recommend(fixture_client_data, 2)
        assert isinstance(recommendations, list)
        assert len(recommendations) == 2

        # Verify that addon2 - the most heavily weighted addon - was
        # recommended first
        result = recommendations[0]
        assert type(result) is tuple
        assert len(result) == 2
        assert result[0] == "addon2.id"
        assert type(result[1]) is numpy.float64
        assert numpy.isclose(result[1], numpy.float64("0.3225"))

        # Verify that addon5 - the second most heavily weighted addon -
        # came next
        result = recommendations[1]
        assert type(result) is tuple
        assert len(result) == 2
        assert result[0] == "addon5.id"
        assert type(result[1]) is numpy.float64
        assert numpy.isclose(result[1], numpy.float64("0.29"))
-------------------------------------------------------------------------------- /tests/test_ensemblerecommender.py: --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0.
If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | from taar.interfaces import ITAARCache 5 | from taar.recommenders.ensemble_recommender import EnsembleRecommender 6 | import mock 7 | import contextlib 8 | import fakeredis 9 | from taar.recommenders.redis_cache import TAARCacheRedis 10 | from .noop_fixtures import ( 11 | noop_taarlocale_dataload, 12 | noop_taarcollab_dataload, 13 | noop_taarlite_dataload, 14 | noop_taarsimilarity_dataload, 15 | ) 16 | from .mocks import MockRecommenderFactory 17 | 18 | from markus import TIMING 19 | from markus.testing import MetricsMock 20 | 21 | EXPECTED = {"collaborative": 1000, "similarity": 100, "locale": 10} 22 | 23 | 24 | def noop_loaders(stack): 25 | stack = noop_taarlocale_dataload(stack) 26 | stack = noop_taarcollab_dataload(stack) 27 | stack = noop_taarlite_dataload(stack) 28 | stack = noop_taarsimilarity_dataload(stack) 29 | return stack 30 | 31 | 32 | @contextlib.contextmanager 33 | def mock_install_mock_ensemble_data(ctx): 34 | DATA = {"ensemble_weights": EXPECTED} 35 | 36 | WHITELIST_DATA = [ 37 | "2.0@disconnect.me", 38 | "@contain-facebook", 39 | "@testpilot-containers", 40 | "CookieAutoDelete@kennydo.com", 41 | "FirefoxColor@mozilla.com", 42 | "adblockultimate@adblockultimate.net", 43 | "addon@darkreader.org", 44 | "adguardadblocker@adguard.com", 45 | "adnauseam@rednoise.org", 46 | "clearcache@michel.de.almeida", 47 | "copyplaintext@eros.man", 48 | "default-bookmark-folder@gustiaux.com", 49 | "enhancerforyoutube@maximerf.addons.mozilla.org", 50 | "extension@one-tab.com", 51 | "extension@tabliss.io", 52 | "firefox-addon@myki.co", 53 | "firefox@ghostery.com", 54 | "forecastfox@s3_fix_version", 55 | "forget-me-not@lusito.info", 56 | "foxyproxy@eric.h.jung", 57 | "foxytab@eros.man", 58 | "gmailnoads@mywebber.com", 59 | ] 60 | 61 | with contextlib.ExitStack() as stack: 62 | TAARCacheRedis._instance = None 63 | stack.enter_context( 64 | mock.patch.object(TAARCacheRedis, "_fetch_ensemble_weights", return_value=DATA, ) 65 | ) 66 | 67 | stack.enter_context( 68 | mock.patch.object( 69 | TAARCacheRedis, "_fetch_whitelist", return_value=WHITELIST_DATA, 70 | ) 71 | ) 72 | 73 | stack = noop_loaders(stack) 74 | 75 | # Patch fakeredis in 76 | stack.enter_context( 77 | mock.patch.object( 78 | TAARCacheRedis, 79 | "init_redis_connections", 80 | return_value={ 81 | 0: fakeredis.FakeStrictRedis(db=0), 82 | 1: fakeredis.FakeStrictRedis(db=1), 83 | 2: fakeredis.FakeStrictRedis(db=2), 84 | }, 85 | ) 86 | ) 87 | 88 | # Initialize redis 89 | cache = TAARCacheRedis.get_instance(ctx) 90 | cache.safe_load_data() 91 | ctx[ITAARCache] = cache 92 | yield stack 93 | 94 | 95 | def test_weight_cache(test_ctx): 96 | with mock_install_mock_ensemble_data(test_ctx): 97 | factory = MockRecommenderFactory() 98 | test_ctx["recommender_factory"] = factory 99 | 100 | test_ctx["recommender_map"] = { 101 | "collaborative": factory.create("collaborative"), 102 | "similarity": factory.create("similarity"), 103 | "locale": factory.create("locale"), 104 | } 105 | 106 | r = EnsembleRecommender(test_ctx) 107 | actual = r.getWeights() 108 | assert EXPECTED == actual 109 | 110 | 111 | def test_recommendations(test_ctx): 112 | with MetricsMock() as mm: 113 | with mock_install_mock_ensemble_data(test_ctx): 114 | EXPECTED_RESULTS = [ 115 | ("ghi", 3430.0), 116 | ("def", 3320.0), 117 | ("ijk", 3200.0), 118 | ("hij", 3100.0), 119 | ("lmn", 420.0), 120 | ] 121 | 122 | factory = MockRecommenderFactory() 123 | 
test_ctx["recommender_factory"] = factory 124 | 125 | test_ctx["recommender_map"] = { 126 | "collaborative": factory.create("collaborative"), 127 | "similarity": factory.create("similarity"), 128 | "locale": factory.create("locale"), 129 | } 130 | r = EnsembleRecommender(test_ctx) 131 | client = {"client_id": "12345"} # Anything will work here 132 | 133 | recommendation_list = r.recommend(client, 5) 134 | assert isinstance(recommendation_list, list) 135 | assert recommendation_list == EXPECTED_RESULTS 136 | 137 | assert mm.has_record(TIMING, "taar.ensemble_recommend") 138 | assert mm.has_record(TIMING, "taar.collaborative_recommend") 139 | assert mm.has_record(TIMING, "taar.locale_recommend") 140 | assert mm.has_record(TIMING, "taar.similarity_recommend") 141 | 142 | 143 | def test_preinstalled_guids(test_ctx): 144 | with mock_install_mock_ensemble_data(test_ctx): 145 | EXPECTED_RESULTS = [ 146 | ("ghi", 3430.0), 147 | ("ijk", 3200.0), 148 | ("lmn", 420.0), 149 | ("klm", 409.99999999999994), 150 | ("abc", 23.0), 151 | ] 152 | 153 | factory = MockRecommenderFactory() 154 | test_ctx["recommender_factory"] = factory 155 | 156 | test_ctx["recommender_map"] = { 157 | "collaborative": factory.create("collaborative"), 158 | "similarity": factory.create("similarity"), 159 | "locale": factory.create("locale"), 160 | } 161 | r = EnsembleRecommender(test_ctx) 162 | 163 | # 'hij' should be excluded from the suggestions list 164 | # The other two addon GUIDs 'def' and 'jkl' will never be 165 | # recommended anyway and should have no impact on results 166 | client = {"client_id": "12345", "installed_addons": ["def", "hij", "jkl"]} 167 | 168 | recommendation_list = r.recommend(client, 5) 169 | print(recommendation_list) 170 | assert isinstance(recommendation_list, list) 171 | assert recommendation_list == EXPECTED_RESULTS 172 | 173 | 174 | def test_mock_client_ids(test_ctx): 175 | with mock_install_mock_ensemble_data(test_ctx): 176 | 177 | EXPECTED_RESULTS = [ 178 | ("2.0@disconnect.me", 0.17), 179 | ("@contain-facebook", 0.25), 180 | ("@testpilot-containers", 0.72), 181 | ("CookieAutoDelete@kennydo.com", 0.37), 182 | ("FirefoxColor@mozilla.com", 0.32), 183 | ] 184 | 185 | factory = MockRecommenderFactory() 186 | test_ctx["recommender_factory"] = factory 187 | 188 | test_ctx["recommender_map"] = { 189 | "collaborative": factory.create("collaborative"), 190 | "similarity": factory.create("similarity"), 191 | "locale": factory.create("locale"), 192 | } 193 | r = EnsembleRecommender(test_ctx) 194 | 195 | # 'hij' should be excluded from the suggestions list 196 | # The other two addon GUIDs 'def' and 'jkl' will never be 197 | # recommended anyway and should have no impact on results 198 | client = {"client_id": "11111"} 199 | 200 | recommendation_list = r.recommend(client, 5) 201 | assert isinstance(recommendation_list, list) 202 | assert recommendation_list == EXPECTED_RESULTS 203 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import uuid 4 | 5 | from flask import Flask 6 | from flask import url_for 7 | 8 | import pytest 9 | 10 | from taar.settings import AppSettings 11 | from taar.context import app_context 12 | from .test_guid_based_recommender import mock_coinstall_ranking_context 13 | 14 | try: 15 | from unittest.mock import MagicMock 16 | except Exception: 17 | from mock import MagicMock 18 | 19 | 20 | def 
hasher(uuid): 21 | return hashlib.new("sha256", str(uuid).encode("utf8")).hexdigest() 22 | 23 | 24 | @pytest.fixture 25 | def app(): 26 | from taar.plugin import configure_plugin 27 | from taar.plugin import PROXY_MANAGER 28 | 29 | flask_app = Flask("test") 30 | 31 | # Clobber the default recommendation manager with a MagicMock 32 | mock_recommender = MagicMock() 33 | PROXY_MANAGER.setTaarRM(mock_recommender) 34 | 35 | configure_plugin(flask_app) 36 | 37 | return flask_app 38 | 39 | 40 | def test_empty_results_by_default(client, app): 41 | # The default behaviour under test should be that the 42 | # RecommendationManager simply no-ops everything so we get back an 43 | # empty result list. 44 | res = client.post("/v1/api/recommendations/not_a_real_hash/") 45 | assert res.json == {"results": []} 46 | 47 | 48 | def test_only_promoted_addons_post(client, app): 49 | # POSTing a JSON blob allows us to specify promoted addons to the 50 | # TAAR service. 51 | res = client.post( 52 | "/v1/api/recommendations/not_a_real_hash/", 53 | json=dict( 54 | {"options": {"promoted": [["guid1", 10], ["guid2", 5], ["guid55", 8]]}} 55 | ), 56 | follow_redirects=True, 57 | ) 58 | # The result should order the GUIDs in descending order of weight 59 | assert res.json == {"results": ["guid1", "guid55", "guid2"]} 60 | 61 | 62 | class FakeRecommendationManager(object): 63 | def __init__(self, *args, **kwargs): 64 | self.logger = logging.getLogger('test') 65 | 66 | 67 | class StaticRecommendationManager(FakeRecommendationManager): 68 | 69 | # Recommenders must return a list of 2-tuple results 70 | # with (GUID, weight) 71 | def recommend(self, client_id, limit, extra_data={}): 72 | result = [ 73 | ("test-addon-1", 1.0), 74 | ("test-addon-2", 1.0), 75 | ("test-addon-N", 1.0), 76 | ] 77 | return result 78 | 79 | 80 | class LocaleRecommendationManager(FakeRecommendationManager): 81 | def recommend(self, client_id, limit, extra_data={}): 82 | if extra_data.get("locale", None) == "en-US": 83 | return [("addon-Locale", 1.0)] 84 | return [] 85 | 86 | 87 | class EmptyRecommendationManager(FakeRecommendationManager): 88 | def recommend(self, client_id, limit, extra_data={}): 89 | return [] 90 | 91 | 92 | class PlatformRecommendationManager(FakeRecommendationManager): 93 | def recommend(self, client_id, limit, extra_data={}): 94 | if extra_data.get("platform", None) == "WOW64": 95 | return [("addon-WOW64", 1.0)] 96 | return [] 97 | 98 | 99 | class ProfileFetcherEnabledRecommendationManager(FakeRecommendationManager): 100 | def __init__(self, *args, **kwargs): 101 | self._ctx = app_context() 102 | self._ctx["profile_fetcher"] = kwargs["profile_fetcher"] 103 | super(ProfileFetcherEnabledRecommendationManager, self).__init__(args, kwargs) 104 | 105 | 106 | @pytest.fixture 107 | def locale_recommendation_manager(monkeypatch): 108 | # Force the plugin configuration 109 | import os 110 | 111 | os.environ["TAAR_API_PLUGIN"] = "taar.plugin" 112 | 113 | import taar.flask_app 114 | 115 | taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": LocaleRecommendationManager()}) 116 | 117 | 118 | @pytest.fixture 119 | def empty_recommendation_manager(monkeypatch): 120 | # Force the plugin configuration 121 | import os 122 | 123 | os.environ["TAAR_API_PLUGIN"] = "taar.plugin" 124 | 125 | import taar.flask_app 126 | 127 | taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": EmptyRecommendationManager()}) 128 | 129 | 130 | @pytest.fixture 131 | def platform_recommendation_manager(monkeypatch): 132 | # Force the plugin configuration 133 | import os 134 | 
135 |     os.environ["TAAR_API_PLUGIN"] = "taar.plugin"
136 | 
137 |     import taar.flask_app
138 | 
139 |     taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": PlatformRecommendationManager()})
140 | 
141 | 
142 | @pytest.fixture
143 | def static_recommendation_manager(monkeypatch):
144 |     # Force the plugin configuration
145 |     import os
146 | 
147 |     os.environ["TAAR_API_PLUGIN"] = "taar.plugin"
148 | 
149 |     import taar.flask_app
150 | 
151 |     taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": StaticRecommendationManager()})
152 | 
153 | 
154 | @pytest.fixture
155 | def profile_enabled_rm(monkeypatch):
156 |     # Force the plugin configuration
157 |     import os
158 | 
159 |     os.environ["TAAR_API_PLUGIN"] = "taar.plugin"
160 | 
161 |     import taar.flask_app
162 | 
163 |     mock_profile = {"installed_addons": ["addon_119", "addon_219"]}
164 | 
165 |     class MockPF:
166 |         def get(self, hashed_client_id):
167 |             return mock_profile
168 | 
169 |     pf = MockPF()
170 |     pfm = ProfileFetcherEnabledRecommendationManager(profile_fetcher=pf)
171 |     taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": pfm})
172 | 
173 | 
174 | def test_empty_recommendation(client, empty_recommendation_manager):
175 |     response = client.post(
176 |         url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
177 |     )
178 |     assert response.status_code == 200
179 |     assert response.headers["Content-Type"] == "application/json"
180 |     assert response.data == b'{"results": []}'
181 | 
182 | 
183 | def test_locale_recommendation(client, locale_recommendation_manager):
184 |     response = client.post(
185 |         url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
186 |         + "?locale=en-US"
187 |     )
188 |     assert response.status_code == 200
189 |     assert response.headers["Content-Type"] == "application/json"
190 |     assert response.data == b'{"results": ["addon-Locale"]}'
191 | 
192 |     response = client.post(
193 |         url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
194 |     )
195 |     assert response.status_code == 200
196 |     assert response.headers["Content-Type"] == "application/json"
197 |     assert response.data == b'{"results": []}'
198 | 
199 | 
200 | def test_platform_recommendation(client, platform_recommendation_manager):
201 |     uri = url_for("recommendations", hashed_client_id=hasher(uuid.uuid4())) + "?platform=WOW64"
202 |     response = client.post(uri)
203 |     assert response.status_code == 200
204 |     assert response.headers["Content-Type"] == "application/json"
205 |     assert response.data == b'{"results": ["addon-WOW64"]}'
206 | 
207 |     response = client.post(
208 |         url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
209 |     )
210 |     assert response.status_code == 200
211 |     assert response.headers["Content-Type"] == "application/json"
212 |     assert response.data == b'{"results": []}'
213 | 
214 | 
215 | def test_simple_request(client, static_recommendation_manager):
216 |     url = url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
217 |     response = client.post(url)
218 |     assert response.status_code == 200
219 |     assert response.headers["Content-Type"] == "application/json"
220 |     expected = b'{"results": ["test-addon-1", "test-addon-2", "test-addon-N"]}'
221 |     assert response.data == expected
222 | 
223 | 
224 | def test_mixed_and_promoted_and_taar_addons(client, static_recommendation_manager):
225 |     """
226 |     Test that TAAR addon suggestions are merged with the promoted addon
227 |     set, with the promoted addons ranked first.
228 |     """
229 |     url = url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
230 |     res = client.post(
231 |         url,
232 |         json=dict(
233 |             {"options": {"promoted": [["guid1", 10], ["guid2", 5], ["guid55", 8]]}}
234 |         ),
235 |         follow_redirects=True,
236 |     )
237 |     # The result should order the promoted GUIDs in descending order of weight
238 |     expected = {
239 |         "results": [
240 |             "guid1",
241 |             "guid55",
242 |             "guid2",
243 |             "test-addon-1",
244 |             "test-addon-2",
245 |             "test-addon-N",
246 |         ]
247 |     }
248 |     assert res.json == expected
249 | 
250 | 
251 | def test_overlapping_mixed_and_promoted_and_taar_addons(
252 |     client, static_recommendation_manager
253 | ):
254 |     """
255 |     Test that a promoted addon which overlaps with a TAAR suggestion is
256 |     returned only once, at its promoted rank.
257 |     """
258 |     url = url_for("recommendations", hashed_client_id=hasher(uuid.uuid4()))
259 |     res = client.post(
260 |         url,
261 |         json=dict(
262 |             {
263 |                 "options": {
264 |                     "promoted": [["test-addon-1", 10], ["guid2", 5], ["guid55", 8]]
265 |                 }
266 |             }
267 |         ),
268 |         follow_redirects=True,
269 |     )
270 |     # The result should order the promoted GUIDs in descending order of weight
271 |     expected = {
272 |         "results": ["test-addon-1", "guid55", "guid2", "test-addon-2", "test-addon-N"]
273 |     }
274 |     assert res.json == expected
275 | 
276 | 
277 | def test_client_addon_lookup_no_client(client, profile_enabled_rm):
278 |     """
279 |     Test that the lookup returns False for an addon id that is not in the client profile.
280 |     """
281 |     hashed_client_id = hasher(uuid.uuid4())
282 |     addon_id = "abc123"
283 | 
284 |     url = url_for(
285 |         "client_has_addon", hashed_client_id=hashed_client_id, addon_id=addon_id
286 |     )
287 |     res = client.get(url, follow_redirects=True)
288 | 
289 |     expected = {"results": False}
290 |     assert res.json == expected
291 | 
292 | 
293 | def test_client_has_addon(client, profile_enabled_rm):
294 |     """
295 |     Test that the lookup returns True when the addon is in the client profile.
296 |     """
297 | 
298 |     hashed_client_id = hasher(uuid.uuid4())
299 |     addon_id = "addon_119"
300 | 
301 |     url = url_for(
302 |         "client_has_addon", hashed_client_id=hashed_client_id, addon_id=addon_id
303 |     )
304 |     res = client.get(url, follow_redirects=True)
305 | 
306 |     expected = {"results": True}
307 |     assert res.json == expected
308 | 
309 | 
310 | def test_client_has_no_addon(client, profile_enabled_rm):
311 |     """
312 |     Test that the lookup returns False when the addon is not installed.
313 |     """
314 | 
315 |     hashed_client_id = hasher(uuid.uuid4())
316 |     addon_id = "addon_984932434"
317 | 
318 |     url = url_for(
319 |         "client_has_addon", hashed_client_id=hashed_client_id, addon_id=addon_id
320 |     )
321 |     res = client.get(url, follow_redirects=True)
322 | 
323 |     assert res.json["results"] is False
324 | 
325 | 
326 | def test_taarlite(client, test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
327 |     """
328 |     Check that the taarlite result size equals TAARLITE_MAX_RESULTS.
329 |     """
330 | 
331 |     with mock_coinstall_ranking_context(
332 |         test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
333 |     ):
334 |         url = url_for("taarlite_recommendations", guid="guid-1")
335 |         res = client.get(url, follow_redirects=True)
336 | 
337 |         assert len(res.json["results"]) == AppSettings.TAARLITE_MAX_RESULTS
338 |         assert res.json["results"] == ["guid-5", "guid-6", "guid-3", "guid-2"]
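339 | 
340 | 
341 | # For reference: the promoted-addon tests above pin down the merge rule
342 | # under test. Promoted addons are ordered by descending weight, placed
343 | # ahead of the TAAR suggestions, and overlapping GUIDs are kept only at
344 | # their promoted rank. A minimal sketch consistent with these fixtures
345 | # (an illustration only, not the actual plugin code):
346 | def _sketch_merge_promoted(promoted, taar_results):
347 |     # promoted: list of [guid, weight]; taar_results: list of (guid, weight)
348 |     head = [guid for guid, _ in sorted(promoted, key=lambda p: p[1], reverse=True)]
349 |     tail = [guid for guid, _ in taar_results if guid not in head]
350 |     return head + tail
351 | 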
--------------------------------------------------------------------------------
/tests/test_localerecommender.py:
--------------------------------------------------------------------------------
1 | # This Source Code Form is subject to the terms of the Mozilla Public
2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 | 
5 | import bz2
6 | import contextlib
7 | import json
8 | 
9 | import fakeredis
10 | import mock
11 | from google.cloud import storage
12 | 
13 | from taar.interfaces import ITAARCache
14 | from taar.recommenders.locale_recommender import LocaleRecommender
15 | from taar.recommenders.redis_cache import TAARCacheRedis
16 | from taar.settings import DefaultCacheSettings
17 | from .noop_fixtures import (
18 |     noop_taarcollab_dataload,
19 |     noop_taarlite_dataload,
20 |     noop_taarsimilarity_dataload,
21 |     noop_taarensemble_dataload,
22 | )
23 | 
24 | FAKE_LOCALE_DATA = {
25 |     "te-ST": [
26 |         ["{1e6b8bce-7dc8-481c-9f19-123e41332b72}", 0.1],
27 |         ["some-other@nice-addon.com", 0.2],
28 |         ["{66d1eed2-a390-47cd-8215-016e9fa9cc55}", 0.3],
29 |         ["{5f1594c3-0d4c-49dd-9182-4fbbb25131a7}", 0.4],
30 |     ],
31 |     "en": [["other-addon@some-id.it", 0.3], ["some-uuid@test-addon.com", 0.1]],
32 | }
33 | 
34 | 
35 | def install_mock_data(ctx):
36 |     ctx = ctx.child()
37 | 
38 |     byte_data = json.dumps(FAKE_LOCALE_DATA).encode("utf8")
39 |     byte_data = bz2.compress(byte_data)
40 | 
41 |     client = storage.Client()
42 |     bucket = client.get_bucket(DefaultCacheSettings.TAAR_LOCALE_BUCKET)
43 |     blob = bucket.blob(DefaultCacheSettings.TAAR_LOCALE_KEY)
44 |     blob.upload_from_string(byte_data)
45 | 
46 |     return ctx
47 | 
48 | 
49 | @contextlib.contextmanager
50 | def mock_locale_data(ctx):
51 |     with contextlib.ExitStack() as stack:
52 |         TAARCacheRedis._instance = None
53 |         stack.enter_context(
54 |             mock.patch.object(
55 |                 TAARCacheRedis, "_fetch_locale_data", return_value=FAKE_LOCALE_DATA,
56 |             )
57 |         )
58 | 
59 |         stack = noop_taarlite_dataload(stack)
60 |         stack = noop_taarcollab_dataload(stack)
61 |         stack = noop_taarsimilarity_dataload(stack)
62 |         stack = noop_taarensemble_dataload(stack)
63 | 
64 |         # Patch fakeredis in
65 |         stack.enter_context(
66 |             mock.patch.object(
67 |                 TAARCacheRedis,
68 |                 "init_redis_connections",
69 |                 return_value={
70 |                     0: fakeredis.FakeStrictRedis(db=0),
71 |                     1: fakeredis.FakeStrictRedis(db=1),
72 |                     2: fakeredis.FakeStrictRedis(db=2),
73 |                 },
74 |             )
75 |         )
76 | 
77 |         # Initialize redis
78 |         cache = TAARCacheRedis.get_instance(ctx)
79 |         cache.safe_load_data()
80 |         ctx[ITAARCache] = cache
81 |         yield stack
82 | 
83 | 
84 | def test_can_recommend(test_ctx):
85 |     with mock_locale_data(test_ctx):
86 |         r = LocaleRecommender(test_ctx)
87 | 
88 |         # Test that we can't recommend if we don't have enough client info.
89 |         assert not r.can_recommend({})
90 |         assert not r.can_recommend({"locale": []})
91 | 
92 |         # Check that we can recommend if the client reports a locale.
93 |         assert r.can_recommend({"locale": "en"})
94 | 
95 | 
96 | def test_can_recommend_no_model(test_ctx):
97 |     with mock_locale_data(test_ctx):
98 |         r = LocaleRecommender(test_ctx)
99 | 
100 |         # We should not be able to recommend when the locale is
101 |         # missing, empty, or not present in the model data.
102 |         assert not r.can_recommend({})
103 |         assert not r.can_recommend({"locale": []})
104 |         assert not r.can_recommend({"locale": "it"})
105 | 
106 | 
107 | def test_recommendations(test_ctx):
108 |     """Test that the locale recommender returns the correct
109 |     locale-dependent addons.
110 | 
111 |     The JSON output for this recommender should be a list of 2-tuples
112 |     of (GUID, weight).
113 |     """
114 |     with mock_locale_data(test_ctx):
115 |         r = LocaleRecommender(test_ctx)
116 | 
117 |         recommendations = r.recommend({"locale": "en"}, 10)
118 | 
119 |         # Make sure the structure of the recommendations is correct and that we
120 |         # recommended the right addons.
121 |         assert isinstance(recommendations, list)
122 |         assert len(recommendations) == len(FAKE_LOCALE_DATA["en"])
123 | 
124 |         # Make sure that the reported addons are the ones from the fake data.
125 |         for (addon_id, weight), (expected_id, expected_weight) in zip(
126 |             recommendations, FAKE_LOCALE_DATA["en"]
127 |         ):
128 |             assert addon_id == expected_id
129 |             assert weight == expected_weight
130 | 
131 | 
132 | def test_recommender_extra_data(test_ctx):
133 |     # Test that the recommender uses locale data from the "extra"
134 |     # section if available.
135 |     def validate_recommendations(data, expected_locale):
136 |         # Make sure the structure of the recommendations is correct and that we
137 |         # recommended the right addons.
138 |         data = sorted(data, key=lambda x: x[1], reverse=True)
139 |         assert isinstance(data, list)
140 |         assert len(data) == len(FAKE_LOCALE_DATA[expected_locale])
141 | 
142 |         # Make sure that the reported addons are the ones from the fake data.
143 |         for (addon_id, weight), (expected_id, expected_weight) in zip(
144 |             data, FAKE_LOCALE_DATA[expected_locale]
145 |         ):
146 |             assert addon_id == expected_id
147 |             assert weight == expected_weight
148 | 
149 |     with mock_locale_data(test_ctx):
150 |         r = LocaleRecommender(test_ctx)
151 |         recommendations = r.recommend({}, 10, extra_data={"locale": "en"})
152 |         validate_recommendations(recommendations, "en")
153 | 
154 |         # Make sure that we favour client data over the extra data.
155 |         recommendations = r.recommend(
156 |             {"locale": "en"}, 10, extra_data={"locale": "te-ST"}
157 |         )
158 |         validate_recommendations(recommendations, "en")
159 | 
--------------------------------------------------------------------------------
/tests/test_profile_fetcher.py:
--------------------------------------------------------------------------------
1 | # This Source Code Form is subject to the terms of the Mozilla Public
2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 | 5 | from taar.profile_fetcher import ProfileFetcher 6 | from taar.profile_fetcher import BigTableProfileController 7 | from google.cloud import bigtable 8 | import copy 9 | import json 10 | import zlib 11 | from mock import MagicMock 12 | 13 | 14 | class MockProfileController: 15 | def __init__(self, mock_profile): 16 | self._profile = mock_profile 17 | 18 | def get_client_profile(self, client_id): 19 | return self._profile 20 | 21 | 22 | def test_profile_fetcher_returns_none(test_ctx): 23 | fetcher = ProfileFetcher(test_ctx) 24 | fetcher.set_client(MockProfileController(None)) 25 | assert fetcher.get("random-client-id") is None 26 | 27 | 28 | MOCK_DATA = { 29 | "profile": { 30 | u"scalar_parent_browser_engagement_total_uri_count": 791, 31 | u"city": u"Rome", 32 | u"scalar_parent_browser_engagement_tab_open_event_count": 46, 33 | u"subsession_start_date": u"2017-09-20T10:00:00.0+02:00", 34 | u"subsession_length": 3785, 35 | u"places_bookmarks_count": 0, 36 | u"scalar_parent_browser_engagement_unique_domains_count": 11, 37 | u"os": u"Windows_NT", 38 | u"active_addons": [ 39 | {u"addon_id": u"e10srollout@mozilla.org"}, 40 | {u"addon_id": u"firefox@getpocket.com"}, 41 | {u"addon_id": u"webcompat@mozilla.org", "is_system": True}, 42 | ], 43 | u"locale": "it-IT", 44 | }, 45 | "expected_result": { 46 | "client_id": "random-client-id", 47 | "bookmark_count": 0, 48 | "disabled_addons_ids": [], 49 | "geo_city": "Rome", 50 | "os": "Windows_NT", 51 | "subsession_length": 3785, 52 | "tab_open_count": 46, 53 | "total_uri": 791, 54 | "unique_tlds": 11, 55 | "installed_addons": ["e10srollout@mozilla.org", "firefox@getpocket.com",], 56 | "locale": "it-IT", 57 | }, 58 | } 59 | 60 | 61 | def test_profile_fetcher_returns_dict(test_ctx): 62 | fetcher = ProfileFetcher(test_ctx) 63 | 64 | mock_data = MOCK_DATA["profile"] 65 | mock_profile_controller = MockProfileController(mock_data) 66 | fetcher.set_client(mock_profile_controller) 67 | 68 | # Note that active_addons in the raw JSON source is remapped to 69 | # 'installed_addons' 70 | assert fetcher.get("random-client-id") == MOCK_DATA["expected_result"] 71 | 72 | 73 | def test_dont_crash_without_active_addons(test_ctx): 74 | mock_data = copy.deepcopy(MOCK_DATA["profile"]) 75 | del mock_data["active_addons"] 76 | mock_profile_controller = MockProfileController(mock_data) 77 | 78 | fetcher = ProfileFetcher(test_ctx) 79 | fetcher.set_client(mock_profile_controller) 80 | 81 | expected = copy.deepcopy(MOCK_DATA["expected_result"]) 82 | expected["installed_addons"][:] = [] 83 | assert fetcher.get("random-client-id") == expected 84 | 85 | 86 | def test_crashy_profile_controller(test_ctx, monkeypatch): 87 | def mock_bigtable_client(*args, **kwargs): 88 | class MockClient: 89 | def __init__(self, *args, **kwargs): 90 | pass 91 | 92 | def instance(self, *args, **kwargs): 93 | return MagicMock() 94 | 95 | return MockClient 96 | 97 | monkeypatch.setattr(bigtable, "Client", mock_bigtable_client) 98 | 99 | pc = BigTableProfileController( 100 | test_ctx, "mock_project_id", "mock_instance_id", "mock_table_id" 101 | ) 102 | assert pc.get_client_profile("exception_raising_client_id") is None 103 | 104 | 105 | def test_profile_controller(test_ctx, monkeypatch): 106 | class MockCell: 107 | client_profile = {"key": "with_some_data"} 108 | value = zlib.compress(json.dumps(client_profile).encode("utf8")) 109 | 110 | mc = MockCell() 111 | 112 | def mock_bigtable_client(*args, **kwargs): 113 | class MockTable: 114 | def __init__(self, table_id): 115 | pass 116 | 117 | def 
read_row(self, *args, **kwargs):
118 |                 class MockRow:
119 |                     @property
120 |                     def cells(self):
121 |                         magic_cn = MagicMock()
122 |                         magic_cn.__getitem__.return_value = mc
123 | 
124 |                         magic_cf = MagicMock()
125 |                         magic_cf.__getitem__.return_value = magic_cn
126 | 
127 |                         mm = MagicMock()
128 |                         mm.__getitem__.return_value = magic_cf
129 |                         return mm
130 | 
131 |                 return MockRow()
132 | 
133 |         class MockInstance:
134 |             def table(self, table_id):
135 |                 return MockTable(table_id)
136 | 
137 |         class MockClient:
138 |             def instance(self, *args, **kwargs):
139 |                 return MockInstance(*args, **kwargs)
140 | 
141 |         return MockClient
142 | 
143 |     monkeypatch.setattr(bigtable, "Client", mock_bigtable_client)
144 | 
145 |     pc = BigTableProfileController(
146 |         test_ctx, "mock_project_id", "mock_instance_id", "mock_table_id"
147 |     )
148 |     jdata = pc.get_client_profile("a_mock_client")
149 |     assert jdata == {"key": "with_some_data"}
150 | 
151 | 
152 | def test_profile_controller_no_user(test_ctx, monkeypatch):
153 |     def mock_bigtable_client(*args, **kwargs):
154 |         class MockTable:
155 |             def __init__(self, table_id):
156 |                 pass
157 | 
158 |             def read_row(self, *args, **kwargs):
159 |                 return None
160 | 
161 |         class MockInstance:
162 |             def table(self, table_id):
163 |                 return MockTable(table_id)
164 | 
165 |         class MockClient:
166 |             def instance(self, *args, **kwargs):
167 |                 return MockInstance(*args, **kwargs)
168 | 
169 |         return MockClient
170 | 
171 |     monkeypatch.setattr(bigtable, "Client", mock_bigtable_client)
172 | 
173 |     pc = BigTableProfileController(
174 |         test_ctx, "mock_project_id", "mock_instance_id", "mock_table_id"
175 |     )
176 |     jdata = pc.get_client_profile("a_mock_client")
177 |     assert jdata is None
178 | 
--------------------------------------------------------------------------------
/tests/test_randomizer.py:
--------------------------------------------------------------------------------
1 | """
2 | Test that we can reorder (GUID, weight) tuples using weighted random
3 | selection, where each GUID's weight sets its selection probability.
4 | """
5 | 
6 | from taar.recommenders.randomizer import reorder_guids
7 | from taar.recommenders.randomizer import in_experiment
8 | 
9 | import numpy as np
10 | from collections import Counter
11 | 
12 | 
13 | def most_frequent(values):
14 |     occurrence_count = Counter(values)
15 |     return occurrence_count.most_common(1)[0][0]
16 | 
17 | 
18 | def test_reorder_guids():
19 |     # These weights are selected carefully so that they are different
20 |     # enough that a randomized selection using the weighted inputs
21 |     # will be stable 'enough' that we should be able to pass tests
22 |     # consistently over a sufficiently large sample
23 | 
24 |     # Fix the random seed so that we get stable results between test
25 |     # runs
26 |     np.random.seed(seed=42)
27 | 
28 |     guid_weight_tuples = [
29 |         ("guid0", -0.60),
30 |         ("guid1", -0.30),
31 |         ("guid2", 0.09),
32 |         ("guid3", 0.30),
33 |         ("guid4", 2.5),
34 |     ]
35 | 
36 |     # Run this 100 times to get the average ordering
37 |     results = []
38 |     limit = 4
39 |     for i in range(100):
40 |         results.append(reorder_guids(guid_weight_tuples, size=limit))
41 | 
42 |     best_result = []
43 |     for i in range(limit):
44 |         best_result.append(most_frequent([row[i] for row in results])[0])
45 | 
46 |     assert best_result == ["guid4", "guid3", "guid2", "guid1"]
47 | 
48 | 
49 | def test_reorder_guids_size_less_than_limit():
50 |     guid_weight_tuples = [
51 |         ("guid0", -0.60),
52 |         ("guid1", -0.30)]
53 |     limit = 4
54 | 
55 |     reordered = reorder_guids(guid_weight_tuples, size=limit)
56 | 
57 |     assert len(reordered) == 2
58 | 
59 | 
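60 | # For reference: a minimal sketch of the weighted reorder exercised
61 | # above, assuming reorder_guids turns the weights into a probability
62 | # distribution (a softmax keeps the negative weights valid) and then
63 | # samples GUIDs without replacement. This illustrates the technique
64 | # under test; it is not the implementation in
65 | # taar.recommenders.randomizer.
66 | def _sketch_reorder_guids(guid_weight_tuples, size):
67 |     weights = np.array([w for _, w in guid_weight_tuples], dtype=float)
68 |     probs = np.exp(weights) / np.sum(np.exp(weights))
69 |     size = min(size, len(guid_weight_tuples))
70 |     chosen = np.random.choice(len(guid_weight_tuples), size=size, replace=False, p=probs)
71 |     return [guid_weight_tuples[i] for i in chosen]
72 | 
73 | 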
74 | def test_experimental_branch_guid():
75 |     """
76 |     Test the experimental cutoff selection code.
77 | 
78 |     The evaluation should be stable for a given probability and
79 |     client_id.
80 |     """
81 |     for i in range(10, 100, 10):
82 |         client_id = hex(i)[2:]
83 |         cutoff = (i + 9.0) / 100
84 | 
85 |         total = sum([in_experiment(client_id, cutoff) for _ in range(100)])
86 |         assert total == 100
87 | 
88 |         total = sum([in_experiment(client_id, cutoff - 0.1) for _ in range(100)])
89 |         assert total == 0
90 | 
91 | 
92 | def test_in_experiment_zero_prob():
93 |     """
94 |     Regression test for the edge case where some client IDs used to enter the experiment even with 0 probability.
95 |     """
96 |     assert not in_experiment('0ace1ca2a3519332ab93e76a049fe74091fa8fc9063399caa8545dd23f93de5c', xp_prob=0.0)
97 | 
--------------------------------------------------------------------------------
/tests/test_recommendation_manager.py:
--------------------------------------------------------------------------------
1 | # This Source Code Form is subject to the terms of the Mozilla Public
2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file,
3 | # You can obtain one at http://mozilla.org/MPL/2.0/.
4 | 
5 | from taar.recommenders.recommendation_manager import RecommendationManager
6 | from taar.recommenders.base_recommender import AbstractRecommender
7 | 
8 | from .noop_fixtures import (
9 |     noop_taarlocale_dataload,
10 |     noop_taarcollab_dataload,
11 |     noop_taarsimilarity_dataload,
12 |     noop_taarlite_dataload,
13 | )
14 | 
15 | from .mocks import MockRecommenderFactory
16 | 
17 | import operator
18 | from functools import reduce
19 | 
20 | import numpy as np
21 | from markus import TIMING
22 | from markus.testing import MetricsMock
23 | 
24 | import mock
25 | import contextlib
26 | import fakeredis
27 | from taar.recommenders.redis_cache import TAARCacheRedis
28 | 
29 | 
30 | @contextlib.contextmanager
31 | def mock_install_mock_curated_data(ctx):
32 |     mock_data = []
33 |     for i in range(20):
34 |         mock_data.append(str(i) * 16)
35 | 
36 |     mock_ensemble_weights = {
37 |         "ensemble_weights": {"collaborative": 1000, "similarity": 100, "locale": 10,}
38 |     }
39 | 
40 |     with contextlib.ExitStack() as stack:
41 |         TAARCacheRedis._instance = None
42 | 
43 |         stack.enter_context(
44 |             mock.patch.object(TAARCacheRedis, "_fetch_whitelist", return_value=mock_data)
45 |         )
46 |         stack.enter_context(
47 |             mock.patch.object(
48 |                 TAARCacheRedis,
49 |                 "_fetch_ensemble_weights",
50 |                 return_value=mock_ensemble_weights,
51 |             )
52 |         )
53 | 
54 |         stack = noop_taarlite_dataload(stack)
55 |         stack = noop_taarcollab_dataload(stack)
56 |         stack = noop_taarlocale_dataload(stack)
57 |         stack = noop_taarsimilarity_dataload(stack)
58 | 
59 |         stack.enter_context(
60 |             mock.patch.object(TAARCacheRedis, "_fetch_whitelist", return_value=mock_data)
61 |         )
62 | 
63 |         # Patch fakeredis in
64 |         stack.enter_context(
65 |             mock.patch.object(
66 |                 TAARCacheRedis,
67 |                 "init_redis_connections",
68 |                 return_value={
69 |                     0: fakeredis.FakeStrictRedis(db=0),
70 |                     1: fakeredis.FakeStrictRedis(db=1),
71 |                     2: fakeredis.FakeStrictRedis(db=2),
72 |                 },
73 |             )
74 |         )
75 | 
76 |         class DefaultMockProfileFetcher:
77 |             def get(self, client_id):
78 |                 return {"client_id": client_id}
79 | 
80 |         mock_fetcher = DefaultMockProfileFetcher()
81 | 
82 |         ctx["profile_fetcher"] = mock_fetcher
83 |         ctx["recommender_factory"] = MockRecommenderFactory()
84 | 
85 |         # Initialize redis
86 |         TAARCacheRedis.get_instance(ctx).safe_load_data()
87 | 
88 |         yield stack
89 | 
90 | 
91 | class StubRecommender(AbstractRecommender):
92 |     """ A shared, stub 
recommender that can be used for testing. 93 | """ 94 | 95 | def __init__(self, can_recommend, stub_recommendations): 96 | self._can_recommend = can_recommend 97 | self._recommendations = stub_recommendations 98 | 99 | def can_recommend(self, client_info, extra_data={}): 100 | return self._can_recommend 101 | 102 | def recommend(self, client_data, limit, extra_data={}): 103 | return self._recommendations 104 | 105 | 106 | def test_none_profile_returns_empty_list(test_ctx): 107 | with mock_install_mock_curated_data(test_ctx): 108 | 109 | class MockProfileFetcher: 110 | def get(self, client_id): 111 | return None 112 | 113 | test_ctx["profile_fetcher"] = MockProfileFetcher() 114 | 115 | rec_manager = RecommendationManager(test_ctx) 116 | assert rec_manager.recommend("random-client-id", 10) == [] 117 | 118 | 119 | def test_simple_recommendation(test_ctx): 120 | # Fix the random seed so that we get stable results between test 121 | # runs 122 | np.random.seed(seed=42) 123 | 124 | with mock_install_mock_curated_data(test_ctx): 125 | EXPECTED_RESULTS = [ 126 | ("def", 3320.0), 127 | ("klm", 409.99999999999994), 128 | ("hij", 3100.0), 129 | ("ijk", 3200.0), 130 | ("ghi", 3430.0), 131 | ("lmn", 420.0), 132 | ("jkl", 400.0), 133 | ("abc", 23.0), 134 | ("fgh", 22.0), 135 | ("efg", 21.0) 136 | ] 137 | 138 | with MetricsMock() as mm: 139 | manager = RecommendationManager(test_ctx) 140 | recommendation_list = manager.recommend("some_ignored_id", 10) 141 | 142 | assert isinstance(recommendation_list, list) 143 | assert recommendation_list == EXPECTED_RESULTS 144 | 145 | assert mm.has_record(TIMING, stat="taar.profile_recommendation") 146 | 147 | 148 | def test_fixed_client_id_valid(test_ctx): 149 | with mock_install_mock_curated_data(test_ctx): 150 | manager = RecommendationManager(test_ctx) 151 | recommendation_list = manager.recommend("111111", 10) 152 | assert len(recommendation_list) == 10 153 | 154 | 155 | def test_fixed_client_id_empty_list(test_ctx): 156 | class NoClientFetcher: 157 | def get(self, client_id): 158 | return None 159 | 160 | with mock_install_mock_curated_data(test_ctx): 161 | test_ctx["profile_fetcher"] = NoClientFetcher() 162 | 163 | manager = RecommendationManager(test_ctx) 164 | recommendation_list = manager.recommend("not_a_real_client_id", 10) 165 | 166 | assert len(recommendation_list) == 0 167 | 168 | 169 | def test_experimental_randomization(test_ctx): 170 | with mock_install_mock_curated_data(test_ctx): 171 | 172 | manager = RecommendationManager(test_ctx) 173 | raw_list = manager.recommend("111111", 10) 174 | 175 | # Clobber the experiment probability to be 100% to force a 176 | # reordering. 177 | test_ctx["TAAR_EXPERIMENT_PROB"] = 1.0 178 | 179 | manager = RecommendationManager(test_ctx) 180 | rand_list = manager.recommend("111111", 10) 181 | 182 | """ 183 | The two lists should be : 184 | 185 | * different (guid, weight) lists (possibly just order) 186 | * same length 187 | """ 188 | assert ( 189 | reduce( 190 | operator.and_, 191 | [ 192 | (t1[0] == t2[0] and t1[1] == t2[1]) 193 | for t1, t2 in zip(rand_list, raw_list) 194 | ], 195 | ) 196 | is False 197 | ) 198 | 199 | assert len(rand_list) == len(raw_list) 200 | --------------------------------------------------------------------------------