├── data
│   ├── __init__.py
│   ├── queries
│   │   └── marreco
│   │       └── datajet
│   │           ├── search.sql
│   │           ├── orderconfirmation.sql
│   │           ├── purchase.sql
│   │           └── productview.sql
│   ├── help.py
│   └── exporter.py
├── tests
│   ├── __init__.py
│   ├── data
│   │   ├── test_template.html
│   │   ├── build_query_test
│   │   ├── test_macro_template.html
│   │   ├── search_mock.json
│   │   ├── orderconfirmation_mock.json
│   │   └── productview_mock.json
│   ├── system
│   │   ├── data
│   │   │   ├── neighbor
│   │   │   │   ├── transformed_2.json
│   │   │   │   ├── transformed_1.json
│   │   │   │   └── train
│   │   │   │       ├── 1
│   │   │   │       │   └── train.json
│   │   │   │       └── 2
│   │   │   │           └── train.json
│   │   │   ├── top_seller
│   │   │   │   ├── train
│   │   │   │   │   ├── 1
│   │   │   │   │   │   └── train.json
│   │   │   │   │   └── 2
│   │   │   │   │       └── train.json
│   │   │   │   └── datajet_test.json
│   │   │   └── datajet_test.json
│   │   └── spark_jobs
│   │       ├── test_top_seller.py
│   │       └── test_neighbor.py
│   └── unit
│       ├── spark_jobs
│       │   ├── test_factory.py
│       │   ├── test_base.py
│       │   ├── test_run_marreco.py
│       │   ├── test_top_seller.py
│       │   └── test_neighbor.py
│       └── data
│           ├── test_help.py
│           └── test_exporter.py
├── spark_jobs
│   ├── __init__.py
│   ├── factory.py
│   ├── run_marreco.py
│   ├── base.py
│   ├── top_seller.py
│   └── neighbor.py
├── bin
│   ├── pytest.sh
│   ├── pytest_system_neighbor.sh
│   ├── export_datajet.sh
│   ├── dataproc_top_seller.py
│   ├── dataproc_neighbor.sh
│   ├── utils.sh
│   ├── create_cluster.sh
│   ├── launch_jupyter_interface.sh
│   └── export_datajet.py
├── .coveragerc
├── requirements.txt
├── notebooks
│   ├── .gitignore
│   ├── marreco_dense_dimsum.ipynb
│   ├── marreco_df.ipynb
│   ├── marreco_dimsum_internal.ipynb
│   ├── rdd_marreco_test.ipynb
│   └── marreco_dimsum_sparse.ipynb
├── .gitignore
├── LICENSE
├── nox.py
└── README.md
/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spark_jobs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/test_template.html: -------------------------------------------------------------------------------- 1 |

{{name}}

2 | -------------------------------------------------------------------------------- /bin/pytest.sh: -------------------------------------------------------------------------------- 1 | py.test tests/unit/ --quiet --cov=. --cov-fail-under=100 2 | -------------------------------------------------------------------------------- /bin/pytest_system_neighbor.sh: -------------------------------------------------------------------------------- 1 | py.test tests/system/spark_jobs/test_neighbor.py --quiet --cov=. --cov-fail-under=100 2 | -------------------------------------------------------------------------------- /tests/data/build_query_test: -------------------------------------------------------------------------------- 1 | SELECT 2 | data 3 | FROM table 4 | WHERE init_days = {{days_interval}} and {{days_interval_end}} 5 | -------------------------------------------------------------------------------- /bin/export_datajet.sh: -------------------------------------------------------------------------------- 1 | python export_datajet.py --days_init=2 --days_end=1 --uri gs://lbanor/pyspark/{day}/train{idx}*.gz --table=dj1 --dataset=simona 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | fail_under = 100 6 | show_missing = True 7 | exclude_lines = 8 | if __name__ == .__main__.: 9 | -------------------------------------------------------------------------------- /tests/data/test_macro_template.html: -------------------------------------------------------------------------------- 1 | {% macro func(v1, v2) %} 2 |

value of v1: {{v1}} 3 |

value of v2: {{v2}} 4 | {% endmacro %} 5 | {{func(v1, v2)}} 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2==2.9.5 2 | mock==2.0.0 3 | pytest==3.2.0 4 | pytest-cov==2.5.1 5 | #pyspark==2.2.0 6 | google-cloud-bigquery==0.26.0 7 | google-cloud-storage==1.3.1 8 | numpy==1.12.1 9 | -------------------------------------------------------------------------------- /tests/system/data/neighbor/transformed_2.json: -------------------------------------------------------------------------------- 1 | {"user_id":"3","interacted_items":[{"key":"1","score":0.5},{"key":"0","score":0.5}]} 2 | {"user_id":"2","interacted_items":[{"key":"0","score":6.0}]} 3 | {"user_id":"0","interacted_items":[{"key":"3","score":1.0}]} 4 | {"user_id":"1","interacted_items":[{"key":"1","score":1.0}]} 5 | -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.sw[op] 3 | 4 | # C extensions 5 | *.so 6 | 7 | # Packages 8 | *.egg 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Unit test / coverage reports 23 | .coverage 24 | .nox 25 | .tox 26 | .cache 27 | htmlcov 28 | 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.sw[op] 3 | *.crc 4 | *.gz 5 | _SUCCESS 6 | 7 | # Packages 8 | *.egg 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Unit test / coverage reports 23 | .coverage 24 | .nox 25 | .tox 26 | .cache 27 | htmlcov 28 | 29 | #GCP keys 30 | key.json 31 | -------------------------------------------------------------------------------- /bin/dataproc_top_seller.py: -------------------------------------------------------------------------------- 1 | gcloud dataproc jobs submit pyspark --cluster=test3 --py-files=base.py,factory.py,top_seller.py --bucket=lbanor run_marreco.py -- --days_init=4 --days_end=2 --source_uri=gs://lbanor/pyspark/train_{}_*.gz --inter_uri=gs://lbanor/pyspark/marreco/top_seller/intermediate/{} --force=no --top_seller_uri=gs://lbanor/pyspark/marreco/top_seller/results --algorithm=top_seller 2 | -------------------------------------------------------------------------------- /bin/dataproc_neighbor.sh: -------------------------------------------------------------------------------- 1 | gcloud dataproc jobs submit pyspark --cluster=test3 --py-files=base.py,factory.py,neighbor.py --bucket=lbanor run_marreco.py -- --days_init=4 --days_end=4 --source_uri=gs://lbanor/pyspark/{}/train*.gz --inter_uri=gs://lbanor/pyspark/marreco/neighbor/intermediate/{} --threshold=0.1 --force=no --users_matrix_uri=gs://lbanor/pyspark/marreco/neighbor/user_matrix --decay=0.03 --w_browse=0.5 --w_purchase=6.0 --neighbor_uri=gs://lbanor/pyspark/marreco/neighbor/neighbor_matrix --algorithm=neighbor 2 | -------------------------------------------------------------------------------- /tests/system/data/neighbor/transformed_1.json: -------------------------------------------------------------------------------- 1 | {"user_id":"2","interacted_items":[{"key":"1","score":1.0}, 
{"key":"2","score":0.5}, {"key":"3","score":0.5}]} 2 | {"user_id":"3","interacted_items":[{"key":"0","score":0.5}, {"key":"1","score":0.5}, {"key":"2","score":6.0}, {"key":"3","score":6.0}]} 3 | {"user_id":"0","interacted_items":[{"key":"0","score":0.5}, {"key":"1","score":1.0}, {"key":"2","score":0.5}, {"key":"3","score":1.0}]} 4 | {"user_id":"1","interacted_items":[{"key":"0","score":1.0}, {"key":"1","score":1.0}, {"key":"2","score":1.0}, {"key":"3","score":0.5}]} 5 | -------------------------------------------------------------------------------- /bin/utils.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function_exists () { 4 | declare -f -F $1 > /dev/null 5 | return $? 6 | } 7 | 8 | throw () { 9 | echo "$*" >&2 10 | echo 11 | function_exists usage && usage 12 | exit 1 13 | } 14 | 15 | get_metadata_property () { 16 | [[ -z $1 ]] && throw "missing function param for DATAPROC_CLUSTER_NAME" || DATAPROC_CLUSTER_NAME=$1 17 | [[ -z $2 ]] && throw "missing function param for METADATA_KEY" || METADATA_KEY=$2 18 | # Get $DATAPROC_CLUSTER_NAME metadata value for key $METADATA_KEY... 19 | gcloud dataproc clusters describe $DATAPROC_CLUSTER_NAME | python -c "import sys,yaml; cluster = yaml.load(sys.stdin); print(cluster['config']['gceClusterConfig']['metadata']['$METADATA_KEY'])" 20 | } 21 | -------------------------------------------------------------------------------- /tests/data/search_mock.json: -------------------------------------------------------------------------------- 1 | { 2 | "event": { 3 | "schema_version": 1, 4 | "user": { 5 | "location": {} 6 | }, 7 | "device": { 8 | "origin": "web" 9 | }, 10 | "source": { 11 | "tracker": "hawk", 12 | "url": "/", 13 | "url_referrer": "/" 14 | }, 15 | "created_at": 1502582400127, 16 | "type": "search_response", 17 | "details": { 18 | "generation_ms": 69, 19 | "request": { 20 | "category_dept": 1, 21 | "facet_count": 1000, 22 | "facets": ["brand", "price", "size", "gender", "color", "categories_slugs", "categories_ids", "owner", "category"], 23 | "fields": "*", 24 | "filters": { 25 | "brand.slug": ["calvin-klein-kids"], 26 | "categories_ids": ["257"] 27 | }, 28 | "gs": 3, 29 | "rq": 5, 30 | "size": 48, 31 | "sort": "relevance", 32 | "top_product_ids": [""] 33 | }, 34 | "response": { 35 | "count": 189, 36 | "id": "d7c104cab610e7edf07290428c4db4e6ec49fcc1", 37 | "items": ["CA947APM37XCS", 38 | "CA947APM24OVZ" 39 | ] 40 | } 41 | } 42 | }, 43 | "created_at": 1502582400127 44 | } 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Willian Fuks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/data/orderconfirmation_mock.json: -------------------------------------------------------------------------------- 1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"03651597b830fbbee9c7f4299989bd48","type":"bid"},"djUCID":{"value":"610574c802ba3b33","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 7.0; XT1635-02 Build/NPN25.137-24-1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/136.0.0.22.91;]","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/checkout/success/","url_referrer":"checkout.dafiti.com.br/checkout/finish/"},"created_at":1502582416663,"local_timestamp":1502582415616,"type":"orderconfirmation","details":{"order_id":"15965531","products":[{"id":"","price":{"current":74.5},"group_id":"DA923SHF35RHK","skus":["DA923SHF35RHK"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]},{"id":"","price":{"current":74.5},"group_id":"VI618SHF69UQC","skus":["VI618SHF69UQC"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]}],"quantities":[1,1]}},"created_at":1502582416663} 2 | -------------------------------------------------------------------------------- /bin/create_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | function usage { 5 | echo "Creates a Dataproc cluster with a Jupyter interface." 
6 | echo "usage $0: [-h] [-n=name] [-b=bucket]" 7 | echo " -h display help" 8 | echo " -n=name name of cluster to create" 9 | echo " -b=bucket name of bucket in GCS for persistence" 10 | exit 1 11 | } 12 | 13 | for i in "$@" 14 | do 15 | case $i in 16 | -n=*) 17 | CLUSTER_NAME="${i#*=}" 18 | shift 19 | ;; 20 | -b=*) 21 | BUCKET_NAME="${i#*=}" 22 | shift 23 | ;; 24 | -h) 25 | usage 26 | ;; 27 | *) 28 | ;; 29 | esac 30 | done 31 | 32 | 33 | [[ -z $CLUSTER_NAME ]] && usage 34 | [[ -z $BUCKET_NAME ]] && usage 35 | 36 | gcloud dataproc clusters create $CLUSTER_NAME \ 37 | --metadata "JUPYTER_PORT=8124,JUPYTER_CONDA_PACKAGES=numpy:pandas:scikit-learn:jinja2:mock:pytest:pytest-cov" \ 38 | --initialization-actions \ 39 | gs://dataproc-initialization-actions/jupyter/jupyter.sh \ 40 | --bucket $BUCKET_NAME \ 41 | --num-workers 2 42 | #--worker-machine-type=n1-highcpu-8 \ 43 | #--master-machine-type=n1-highcpu-8 44 | -------------------------------------------------------------------------------- /tests/system/data/neighbor/train/2/train.json: -------------------------------------------------------------------------------- 1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 3 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 4 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 5 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 6 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 7 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}]}}} 8 | -------------------------------------------------------------------------------- /data/queries/marreco/datajet/search.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | data.* 4 | FROM( 5 | SELECT 6 | ARRAY( 7 | SELECT AS STRUCT 8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user, 9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers, 10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device, 11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source, 12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at, 13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp, 14 | "search" AS type, 15 | STRUCT(REGEXP_EXTRACT(page.pagePath, r'/\?q=(.*)') AS query, "keyword" AS query_type) AS details) event, 16 | 
UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at 17 | FROM UNNEST(hits) WHERE REGEXP_CONTAINS(page.pagePath, r'/\?q=')) data 18 | FROM `{{dataset}}.ga_sessions_*` 19 | WHERE True 20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE REGEXP_CONTAINS(page.pagePath, r'/\?q=')) 21 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval_end}} DAY)) 22 | ), 23 | UNNEST(data) data 24 | -------------------------------------------------------------------------------- /tests/data/productview_mock.json: -------------------------------------------------------------------------------- 1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","url_referrer":"m.dafiti.com.br/catalog/?q=Enfim\\u0026wtqs=1\\u0026dft_capi=1\\u0026page=7"},"created_at":1502582400021,"local_timestamp":1502589588052,"type":"productview","details":{"product":{"id":"","title":"Moletom Enfim Estampado Azul","brand":{"name":"Enfim"},"price":{"current":84.99},"group_id":"MA042APM76IPJ","skus":["MA042APM76IPJ"],"categories":[[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}]],"main_category_path":[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}],"url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","images":["https://dafitistatic-a.akamaihd.net/p/Enfim-Moletom-Enfim-Estampado-Azul-5611-3243292-1-zoom.jpg"],"colors":["Azul"]}}},"created_at":1502582400021} 2 | -------------------------------------------------------------------------------- /tests/system/data/top_seller/train/2/train.json: -------------------------------------------------------------------------------- 1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 3 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 4 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 5 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 6 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 7 | 
{"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}], "quantities": [1]}}} 8 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "1"}], "quantities": [1]}}} 9 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "2"}], "quantities": [2]}}} 10 | -------------------------------------------------------------------------------- /tests/unit/spark_jobs/test_factory.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 
22 | 23 | import unittest 24 | import sys 25 | import os 26 | import mock 27 | 28 | sys.path.append('./spark_jobs') 29 | 30 | class Test_factory(unittest.TestCase): 31 | @staticmethod 32 | def _get_target_class(): 33 | from factory import MarrecoFactory 34 | 35 | 36 | return MarrecoFactory 37 | 38 | 39 | def test_factor_alg(self): 40 | klass = self._get_target_class() 41 | with self.assertRaises(ValueError): 42 | klass._factor_alg('test') 43 | 44 | top_seller = klass._factor_alg('top_seller') 45 | self.assertEqual(top_seller.__name__, 'MarrecoTopSellerJob') 46 | 47 | neighbor = klass._factor_alg('neighbor') 48 | self.assertEqual(neighbor.__name__, 'MarrecoNeighborJob') 49 | -------------------------------------------------------------------------------- /tests/unit/spark_jobs/test_base.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 
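# The tests below cover MarrecoBase directly: the constructor must store the
# task list as given, and run_tasks(sc) must invoke each registered
# (method, kwargs) pair as method(sc, **kwargs).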
22 | 23 | import unittest 24 | import sys 25 | import os 26 | import mock 27 | 28 | sys.path.append('./spark_jobs') 29 | 30 | class Test_base(unittest.TestCase): 31 | @staticmethod 32 | def _get_target_class(): 33 | from base import MarrecoBase 34 | 35 | 36 | return MarrecoBase 37 | 38 | 39 | def test_ctor(self): 40 | klass = self._get_target_class()(['test']) 41 | self.assertEqual(klass.tasks, ['test']) 42 | 43 | 44 | def test_run_tasks(self): 45 | method = mock.Mock() 46 | kwargs = {'1': 1} 47 | sc = mock.Mock() 48 | klass = self._get_target_class()([(method, kwargs)]) 49 | print(klass.tasks) 50 | klass.run_tasks(sc) 51 | 52 | method.assert_called_once_with(sc, **kwargs) 53 | -------------------------------------------------------------------------------- /data/queries/marreco/datajet/orderconfirmation.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | data.* 4 | FROM( 5 | SELECT 6 | ARRAY( 7 | SELECT AS STRUCT 8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user, 9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers, 10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device, 11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source, 12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at, 13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp, 14 | "orderconfirmation" AS type, 15 | STRUCT(transaction.transactionId AS order_id, ARRAY(SELECT STRUCT(STRUCT(productBrand AS name) AS brand, STRUCT(productPrice / 10e6 AS `current`, productPrice / 10e6 AS previous) AS price, REGEXP_EXTRACT(productSKU, r'(.*)-\d+') AS group_id, [COALESCE(REGEXP_EXTRACT(productSKU, r'(.*)-\d+'), productSKU), productSKU] AS skus, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT( v2productCategory, '|')) v) AS main_category_path) FROM UNNEST(product)) AS products, ARRAY(SELECT productQuantity FROM UNNEST(product)) AS quantities) AS details) event, 16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at 17 | FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6') data 18 | FROM `40663402.ga_sessions_*` 19 | WHERE True 20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6') 21 | AND NOT EXISTS(SELECT 1 FROM UNNEST(hits), UNNEST(product) WHERE productSKU IS NULL OR productQuantity IS NULL) 22 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_init}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_end}} DAY)) 23 | ), 24 | UNNEST(data) data 25 | 26 | -------------------------------------------------------------------------------- /data/queries/marreco/datajet/purchase.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | data.* 4 | FROM( 5 | SELECT 6 | ARRAY( 7 | SELECT AS STRUCT 8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user, 9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers, 10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device, 11 | STRUCT("fish" AS 
tracker, page.pagePath AS url, referer AS url_referrer) AS source, 12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at, 13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp, 14 | "orderconfirmation" AS type, 15 | STRUCT(transaction.transactionId AS order_id, ARRAY(SELECT STRUCT(STRUCT(productBrand AS name) AS brand, STRUCT(productPrice / 10e6 AS `current`, productPrice / 10e6 AS previous) AS price, REGEXP_EXTRACT(productSKU, r'(.*)-\d+') AS group_id, [COALESCE(REGEXP_EXTRACT(productSKU, r'(.*)-\d+'), productSKU), productSKU] AS skus, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT( v2productCategory, '|')) v) AS main_category_path) FROM UNNEST(product)) AS products, ARRAY(SELECT productQuantity FROM UNNEST(product)) AS quantities) AS details) event, 16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at 17 | FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6') data 18 | FROM `{{dataset}}.ga_sessions_*` 19 | WHERE True 20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6') 21 | AND NOT EXISTS(SELECT 1 FROM UNNEST(hits), UNNEST(product) WHERE productSKU IS NULL OR productQuantity IS NULL) 22 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval_end}} DAY)) 23 | ), 24 | UNNEST(data) data 25 | -------------------------------------------------------------------------------- /tests/unit/data/test_help.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | 24 | import unittest 25 | import sys 26 | import os 27 | import mock 28 | 29 | sys.path.append('./data') 30 | 31 | class Test_help(unittest.TestCase): 32 | @staticmethod 33 | def _get_target_class(): 34 | from help import Jinjafy 35 | 36 | 37 | return Jinjafy 38 | 39 | 40 | def test_ctor(self): 41 | klass = self._get_target_class()('.') 42 | self.assertEqual(klass.env.loader.searchpath, ['.']) 43 | 44 | 45 | def test_render_template(self): 46 | print(os.path.abspath('.')) 47 | klass = self._get_target_class()('tests/data') 48 | result = klass.render_template('test_template.html', 49 | **{'name': 'test'}) 50 | 51 | expected = """

test

""" 52 | self.assertEqual(result, expected) 53 | -------------------------------------------------------------------------------- /spark_jobs/factory.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | 24 | """ 25 | Main Class to manage Spark Jobs. 26 | """ 27 | 28 | class MarrecoFactory(object): 29 | """Factory to get appropriate algorithm strategy. 30 | 31 | :type algorithm: str 32 | :param algorithm: states which algorithm should be prepared. 33 | 34 | :rtype: `base.MarrecoBase` 35 | :returns: algorithm strategy ready to run jobs and analysis. 36 | """ 37 | @classmethod 38 | def _factor_alg(cls, alg): 39 | if alg == 'top_seller': 40 | from top_seller import MarrecoTopSellerJob 41 | return MarrecoTopSellerJob 42 | elif alg == 'neighbor': 43 | from neighbor import MarrecoNeighborJob 44 | return MarrecoNeighborJob 45 | else: 46 | raise ValueError("Algorithm '{}' is not available. 
Please choose " 47 | "between 'neighbor' or 'top_seller'".format(alg)) 48 | -------------------------------------------------------------------------------- /data/queries/marreco/datajet/productview.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | data.* 4 | FROM( 5 | SELECT 6 | ARRAY( 7 | SELECT AS STRUCT 8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user, 9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers, 10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device, 11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source, 12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at, 13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp, 14 | "productview" AS type, 15 | STRUCT(STRUCT((SELECT v2ProductName FROM UNNEST(product)) AS title, STRUCT((SELECT productBrand FROM UNNEST(product)) AS name) AS brand, STRUCT((SELECT productPrice / 10e6 FROM UNNEST(product)) AS `current`, (SELECT productPrice / 10e6 FROM UNNEST(product)) AS `previous`) AS price, (SELECT productSKU FROM UNNEST(product)) AS group_id, ARRAY(SELECT productSKU FROM UNNEST(product)) AS skus, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT((SELECT v2productCategory FROM UNNEST(product)), ',')) v) AS categories, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT((SELECT v2productCategory FROM UNNEST(product)), ',')) v) AS main_category_path, page.pagePath AS url, ARRAY(SELECT page.pagePath FROM UNNEST(hits) LIMIT 1) AS images) AS product) AS details) event, 16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at 17 | FROM UNNEST(hits) WHERE ecommerceaction.action_type = '2') data 18 | FROM `{{dataset}}.ga_sessions_*` 19 | WHERE True 20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE ecommerceaction.action_type = '2') 21 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval_end}} DAY)) 22 | ), 23 | UNNEST(data) data 24 | -------------------------------------------------------------------------------- /data/help.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | """Helper functions with general scope.""" 24 | 25 | import os 26 | import uuid 27 | from jinja2 import Environment, FileSystemLoader 28 | 29 | from google.cloud.bigquery import Client as bq_Client 30 | from google.cloud.storage import Client as s_Client 31 | 32 | 33 | class Jinjafy(object): 34 | """Handles main operations related to Jinja, such as creating 35 | environments, rendering templates and related operations. 36 | 37 | :type loader_path: str 38 | :param loader_path: folder where the Jinja environment is built 39 | """ 40 | 41 | def __init__(self, loader_path): 42 | self.env = Environment(loader=FileSystemLoader(loader_path)) 43 | 44 | 45 | def render_template(self, file_path, **kwargs): 46 | """Gets a Jinja template and returns it rendered with the given kwargs. 47 | 48 | :type file_path: str 49 | :param file_path: path to the file containing the jinja template 50 | 51 | :param kwargs: key values used to render the jinja template. 52 | """ 53 | return self.env.get_template(file_path).render(**kwargs) 54 | -------------------------------------------------------------------------------- /spark_jobs/run_marreco.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | 24 | """ 25 | Builds Marreco to run Jobs in Spark. 26 | """ 27 | 28 | import sys 29 | import argparse 30 | 31 | import pyspark 32 | from factory import MarrecoFactory 33 | 34 | def get_alg(args): 35 | parser = argparse.ArgumentParser() 36 | 37 | args = [e for e in args if 'algorithm' in e or '-h' in e] 38 | if len(args) == 2: 39 | args.remove('-h') 40 | parser.add_argument('--algorithm', 41 | dest='algorithm', 42 | type=str, 43 | help=('Which algorithm to run.
Currently options are ' 44 | '"neighbor" or "top_seller"')) 45 | 46 | args = parser.parse_args(args) 47 | return args 48 | 49 | def main(): 50 | alg = get_alg(sys.argv[1:]).algorithm 51 | if alg: 52 | job = MarrecoFactory._factor_alg(alg)() 53 | args = job.process_sysargs( 54 | [e for e in sys.argv[1:] if 'algorithm' not in e]) 55 | 56 | with pyspark.SparkContext() as sc: 57 | job.transform_data(sc, args) 58 | job.build_marreco(sc, args) 59 | 60 | 61 | if __name__ == '__main__': 62 | sys.exit(main()) 63 | -------------------------------------------------------------------------------- /nox.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | import os 24 | import nox 25 | 26 | @nox.session 27 | @nox.parametrize('python_version', ['3.6']) 28 | def unit_tests(session, python_version): 29 | """Run just the unit tests""" 30 | 31 | session.interpreter = 'python{}'.format(python_version) 32 | 33 | # Set virtualenv dirname 34 | session.virtualenv_dirname = 'unit-' + python_version 35 | 36 | session.install('mock', 'pytest', 'pytest-cov') 37 | session.install('-e', '.') 38 | 39 | session.run('py.test', 40 | '--quiet', 41 | '--cov=tests.unit', 42 | '--cov-append', 43 | '--cov-config=.coveragerc', 44 | '--cov-report=', 45 | '--cov-fail-under=100', 46 | os.path.join('tests', 'unit'), 47 | *session.posargs 48 | ) 49 | 50 | @nox.session 51 | @nox.parametrize('python_version', ['3.6']) 52 | def system_tests(session, python_version): 53 | """Run tests against a live spark (preferably a local cluster).""" 54 | 55 | session.interpreter = 'python{}'.format(python_version) 56 | 57 | session.virtualenv_dirname = 'sys-' + python_version 58 | 59 | session.install('mock', 'pytest') 60 | session.install('-e', '.') 61 | 62 | session.run('py.test', 63 | '--quiet', 64 | os.path.join('tests', 'system.py'), 65 | *session.posargs 66 | ) 67 | -------------------------------------------------------------------------------- /bin/launch_jupyter_interface.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | DIR="${BASH_SOURCE%/*}" 5 | [[ !
-d "$DIR" ]] && DIR="$PWD" 6 | 7 | source "utils.sh" 8 | 9 | function usage { 10 | echo "Creates an SSH tunnel and socks proxy and launches Chrome, using the environment " 11 | echo "variable DATAPROC_CLUSTER_NAME for the unique cluster name. The cluster metadata " 12 | echo "must contain a value for the key 'JUPYTER_PORT'." 13 | echo "" 14 | echo "If the appropriate environment variables are not set and the appropriate command" 15 | echo "line arguments are not given, then the usage message will be displayed and the " 16 | echo "script will exit." 17 | echo "" 18 | echo "usage: $0 [-h] [-c=cluster-name] [-z=zone]" 19 | echo " -h display help" 20 | echo " -z=zone specify cloud zone for cluster" 21 | echo " -c=cluster-name specify unique dataproc cluster name to launch" 22 | exit 1 23 | } 24 | 25 | for i in "$@" 26 | do 27 | case $i in 28 | -z=*) 29 | ZONE="${i#*=}" 30 | shift # past argument=value 31 | ;; 32 | -c=*) 33 | DATAPROC_CLUSTER_NAME="${i#*=}" 34 | shift # past argument=value 35 | ;; 36 | -h) 37 | usage 38 | ;; 39 | *) 40 | ;; 41 | esac 42 | done 43 | 44 | [[ -z $DATAPROC_CLUSTER_NAME ]] && usage 45 | [[ -z $ZONE ]] && usage 46 | JUPYTER_PORT=$(get_metadata_property $DATAPROC_CLUSTER_NAME JUPYTER_PORT) 47 | [[ ! $JUPYTER_PORT =~ ^[0-9]+$ ]] && throw "metadata must contain a valid 'JUPYTER_PORT' value, but instead has the value \"$JUPYTER_PORT\"" 48 | 49 | # TODO: Ensure that Jupyter notebook is running on cluster master node 50 | 51 | echo "Using following cluster name: $DATAPROC_CLUSTER_NAME" 52 | echo "Using following cluster zone: $ZONE" 53 | echo "Using following remote dataproc jupyter port: $JUPYTER_PORT" 54 | echo "" 55 | 56 | # 0. Set default path to Chrome application (by operating system type). 57 | # OS X 58 | #CHROME_APP_PATH="/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome" 59 | # Linux 60 | CHROME_APP_PATH="/usr/bin/google-chrome" 61 | # Windows 62 | #CHROME_APP_PATH="C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" 63 | 64 | # Following configuration at: 65 | # https://cloud.google.com/dataproc/cluster-web-interfaces 66 | # 1. Setup ssh tunnel and socks proxy 67 | ZONE_FLAG="" 68 | [[ -v ZONE ]] && ZONE_FLAG="--zone=$ZONE" 69 | gcloud compute ssh $ZONE_FLAG --ssh-flag="-D 10000" --ssh-flag="-N" --ssh-flag="-n" "$DATAPROC_CLUSTER_NAME-m" & 70 | sleep 5 # Wait for tunnel to be ready before opening browser... 71 | 72 | # 2.Launch Chrome instance, referencing the proxy server. 
73 | # TODO: Parameterize the chrome app path 74 | eval $CHROME_APP_PATH \ 75 | "http://$DATAPROC_CLUSTER_NAME-m:$JUPYTER_PORT" \ 76 | --proxy-server="socks5://localhost:10000" \ 77 | --host-resolver-rules="MAP * 0.0.0.0 , EXCLUDE localhost" \ 78 | --user-data-dir=/tmp/ 79 | 80 | -------------------------------------------------------------------------------- /tests/system/data/datajet_test.json: -------------------------------------------------------------------------------- 1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","url_referrer":"m.dafiti.com.br/catalog/?q=Enfim\\u0026wtqs=1\\u0026dft_capi=1\\u0026page=7"},"created_at":1502582400021,"local_timestamp":1502589588052,"type":"productview","details":{"product":{"id":"","title":"Moletom Enfim Estampado Azul","brand":{"name":"Enfim"},"price":{"current":84.99},"group_id":"MA042APM76IPJ","skus":["MA042APM76IPJ"],"categories":[[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}]],"main_category_path":[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}],"url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","images":["https://dafitistatic-a.akamaihd.net/p/Enfim-Moletom-Enfim-Estampado-Azul-5611-3243292-1-zoom.jpg"],"colors":["Azul"]}}},"created_at":1502582400021} 2 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"03651597b830fbbee9c7f4299989bd48","type":"bid"},"djUCID":{"value":"610574c802ba3b33","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 7.0; XT1635-02 Build/NPN25.137-24-1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/136.0.0.22.91;]","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/checkout/success/","url_referrer":"checkout.dafiti.com.br/checkout/finish/"},"created_at":1502582416663,"local_timestamp":1502582415616,"type":"orderconfirmation","details":{"order_id":"15965531","products":[{"id":"","price":{"current":74.5},"group_id":"DA923SHF35RHK","skus":["DA923SHF35RHK"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]},{"id":"","price":{"current":74.5},"group_id":"VI618SHF69UQC","skus":["VI618SHF69UQC"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]}],"quantities":[1,1]}},"created_at":1502582416663} 3 | 
{"event":{"schema_version":1,"user":{"location":{}},"device":{"origin":"web"},"source":{"tracker":"hawk","url":"/","url_referrer":"/"},"created_at":1502582400127,"type":"search_response","details":{"generation_ms":69,"request":{"category_dept":1,"facet_count":1000,"facets":["brand","price","size","gender","color","categories_slugs","categories_ids","owner","category"],"fields":"*","filters":{"brand.slug":["calvin-klein-kids"],"categories_ids":["257"]},"gs":3,"rq":5,"size":48,"sort":"relevance","top_product_ids":[""]},"response":{"count":189,"id":"d7c104cab610e7edf07290428c4db4e6ec49fcc1","items":["CA947APM37XCS","CA947APM24OVZ"]}}},"created_at":1502582400127} 4 | -------------------------------------------------------------------------------- /tests/system/data/top_seller/datajet_test.json: -------------------------------------------------------------------------------- 1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","url_referrer":"m.dafiti.com.br/catalog/?q=Enfim\\u0026wtqs=1\\u0026dft_capi=1\\u0026page=7"},"created_at":1502582400021,"local_timestamp":1502589588052,"type":"productview","details":{"product":{"id":"","title":"Moletom Enfim Estampado Azul","brand":{"name":"Enfim"},"price":{"current":84.99},"group_id":"MA042APM76IPJ","skus":["MA042APM76IPJ"],"categories":[[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}]],"main_category_path":[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}],"url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","images":["https://dafitistatic-a.akamaihd.net/p/Enfim-Moletom-Enfim-Estampado-Azul-5611-3243292-1-zoom.jpg"],"colors":["Azul"]}}},"created_at":1502582400021} 2 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"03651597b830fbbee9c7f4299989bd48","type":"bid"},"djUCID":{"value":"610574c802ba3b33","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 7.0; XT1635-02 Build/NPN25.137-24-1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/136.0.0.22.91;]","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/checkout/success/","url_referrer":"checkout.dafiti.com.br/checkout/finish/"},"created_at":1502582416663,"local_timestamp":1502582415616,"type":"orderconfirmation","details":{"order_id":"15965531","products":[{"id":"","price":{"current":74.5},"group_id":"DA923SHF35RHK","skus":["DA923SHF35RHK"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]},{"id":"","price":{"current":74.5},"group_id":"VI618SHF69UQC","skus":["VI618SHF69UQC"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]}],"quantities":[1,1]}},"created_at":1502582416663} 3 | 
{"event":{"schema_version":1,"user":{"location":{}},"device":{"origin":"web"},"source":{"tracker":"hawk","url":"/","url_referrer":"/"},"created_at":1502582400127,"type":"search_response","details":{"generation_ms":69,"request":{"category_dept":1,"facet_count":1000,"facets":["brand","price","size","gender","color","categories_slugs","categories_ids","owner","category"],"fields":"*","filters":{"brand.slug":["calvin-klein-kids"],"categories_ids":["257"]},"gs":3,"rq":5,"size":48,"sort":"relevance","top_product_ids":[""]},"response":{"count":189,"id":"d7c104cab610e7edf07290428c4db4e6ec49fcc1","items":["CA947APM37XCS","CA947APM24OVZ"]}}},"created_at":1502582400127} 4 | -------------------------------------------------------------------------------- /tests/unit/spark_jobs/test_run_marreco.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 
22 | 23 | import unittest 24 | import sys 25 | import mock 26 | from collections import namedtuple 27 | 28 | sys.path.append('./spark_jobs') 29 | 30 | 31 | class Test_run_marreco(unittest.TestCase): 32 | def test_get_alg(self): 33 | from run_marreco import get_alg 34 | 35 | 36 | expected = 'test' 37 | args = get_alg(['--algorithm=test']) 38 | self.assertEqual(expected, args.algorithm) 39 | 40 | 41 | @mock.patch('run_marreco.get_alg') 42 | @mock.patch('run_marreco.pyspark') 43 | @mock.patch('run_marreco.MarrecoFactory') 44 | def test_main_runs(self, factory_mock, spark_mock, get_alg_mock): 45 | from run_marreco import main 46 | 47 | 48 | Args = namedtuple('args', 'algorithm') 49 | args = Args('test') 50 | get_alg_mock.return_value = args 51 | 52 | job_mock = mock.Mock() 53 | factory_mock._factor_alg.return_value.return_value = job_mock 54 | 55 | job_mock.process_sysargs.return_value = 'test' 56 | context_mock = mock.Mock() 57 | spark_mock.SparkContext.return_value = context_mock 58 | context_enter_mock = mock.Mock() 59 | 60 | context_mock.__enter__ = context_enter_mock 61 | context_mock.__exit__ = mock.Mock() 62 | 63 | main() 64 | job_mock.transform_data.assert_called_once_with(context_enter_mock(), 'test') 65 | job_mock.build_marreco.assert_called_once_with(context_enter_mock(), 'test') 66 | 67 | 68 | @mock.patch('run_marreco.get_alg') 69 | @mock.patch('run_marreco.pyspark') 70 | @mock.patch('run_marreco.MarrecoFactory') 71 | def test_main_does_not_run(self, factory_mock, spark_mock, get_alg_mock): 72 | from run_marreco import main 73 | 74 | 75 | Args = namedtuple('args', 'algorithm') 76 | args = Args(None) 77 | get_alg_mock.return_value = args 78 | 79 | job_mock = mock.Mock() 80 | factory_mock._factor_alg.return_value.return_value = job_mock 81 | 82 | job_mock.process_sysargs.return_value = 'test' 83 | context_mock = mock.Mock() 84 | spark_mock.SparkContext.return_value = context_mock 85 | context_enter_mock = mock.Mock() 86 | 87 | context_mock.__enter__ = context_enter_mock 88 | context_mock.__exit__ = mock.Mock() 89 | 90 | main() 91 | job_mock.transform_data.assert_not_called() 92 | job_mock.build_marreco.assert_not_called() 93 | -------------------------------------------------------------------------------- /spark_jobs/base.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 
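# Usage sketch (illustrative only, mirroring tests/unit/spark_jobs/test_base.py;
# the task function and kwargs names here are made up for the example):
#     job = MarrecoBase([(transform, {'days_init': 2})])
#     job.run_tasks(sc)   # calls transform(sc, days_init=2) for each registered task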
22 | 23 | 24 | """ 25 | Base Class for Algorithms in Spark. 26 | """ 27 | 28 | import abc 29 | import datetime 30 | 31 | class MarrecoBase(object): 32 | """Base Class to run Jobs against Spark 33 | 34 | :type tasks: list 35 | :param tasks: list with values [(task, {key:value}] pairs to be used later 36 | on when invoking command ``self.run()`` 37 | """ 38 | def __init__(self, tasks=[]): 39 | self.tasks = tasks 40 | 41 | 42 | def run_tasks(self, sc): 43 | """For each task saved in ``self.task``, uses the context ``sc`` to 44 | execute the jobs. 45 | 46 | :type sc: `pyspark.SparkContext` 47 | :param sc: spark context used to run the jobs. 48 | """ 49 | if not self.tasks: 50 | raise ValueError("``self.tasks`` list is empty. Please specify" 51 | " which jobs you want to run") 52 | 53 | for method, kwargs in self.tasks: 54 | method(sc, **kwargs) 55 | 56 | 57 | @abc.abstractmethod 58 | def process_sysargs(self, args): 59 | """Process input arguments sent in sys args. Each algorithm have its 60 | own implementation for making the parsing. 61 | 62 | :type args: list 63 | :param args: list of arguments like ['--days_init=2', '--days_end=1'] 64 | """ 65 | pass 66 | 67 | 68 | @abc.abstractmethod 69 | def transform_data(self, sc, args): 70 | """Gets data from datajet and transforms so that Marreco can read 71 | and use it properly. Each algorithm shall implement its own strategy 72 | """ 73 | pass 74 | 75 | 76 | @abc.abstractmethod 77 | def build_marreco(self, sc, args): 78 | """Main method for each algorithm where results are calculated, such 79 | as computing matrix similarities or top selling items. 80 | """ 81 | pass 82 | 83 | @abc.abstractmethod 84 | def get_formatted_date(self, day): 85 | """This method is used mainly to transform the input of ``days`` 86 | into a string of type ``YYYY-MM-DD`` 87 | 88 | :type day: int 89 | :param day: how many days in time to come back from today to make 90 | the string transformation. 
91 | 92 | :rtype: str 93 | :returns: formated date of today - day in format %Y-%m-%d 94 | """ 95 | return (datetime.datetime.now() - 96 | datetime.timedelta(days=day)).strftime('%Y-%m-%d') 97 | 98 | -------------------------------------------------------------------------------- /notebooks/marreco_dense_dimsum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | cells: [ 3 | { 4 | cell_type: "code", 5 | execution_count: null, 6 | metadata: { 7 | collapsed: true 8 | }, 9 | outputs: [ ], 10 | source: [ 11 | "from pyspark.mllib.linalg import SparseVector ", 12 | "from pyspark.mllib.linalg.distributed import RowMatrix ", 13 | "import numpy as np ", 14 | "from sklearn.metrics.pairwise import cosine_similarity ", 15 | "import time ", 16 | "from collections import defaultdict ", 17 | "from pyspark.sql import functions as sfunc ", 18 | "from pyspark.sql import types as stypes ", 19 | "import math ", 20 | "import sys ", 21 | "from pyspark.ml.linalg import SparseVector ", 22 | "from pyspark.mllib.linalg.distributed import RowMatrix ", 23 | "from operator import itemgetter " 24 | ] 25 | }, 26 | { 27 | cell_type: "code", 28 | execution_count: null, 29 | metadata: { 30 | collapsed: true 31 | }, 32 | outputs: [ ], 33 | source: [ 34 | "schema = stypes.StructType().add("fv", stypes.StringType()).add("sku", stypes.StringType()).add("score", stypes.FloatType()) ", 35 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema) ", 36 | "train_df.createOrReplaceTempView('test1')" 37 | ] 38 | }, 39 | { 40 | cell_type: "code", 41 | execution_count: null, 42 | metadata: { 43 | collapsed: true 44 | }, 45 | outputs: [ ], 46 | source: [ 47 | "query = """ ", 48 | "SELECT ", 49 | " sku, ", 50 | " ROW_NUMBER() OVER (ORDER BY SUM(1)) -1 idx ", 51 | "FROM test1 ", 52 | "GROUP BY 1 ", 53 | """" ", 54 | "skus_rdd = spark.sql(query).rdd" 55 | ] 56 | }, 57 | { 58 | cell_type: "code", 59 | execution_count: null, 60 | metadata: { 61 | collapsed: true 62 | }, 63 | outputs: [ ], 64 | source: [ 65 | "d = {row.sku: row.idx for row in skus_rdd.collect()} ", 66 | "db = sc.broadcast(d) ", 67 | " ", 68 | "id_ = {value: key for key, value in d.items()} ", 69 | "id_b = sc.broadcast(id_)" 70 | ] 71 | }, 72 | { 73 | cell_type: "code", 74 | execution_count: null, 75 | metadata: { 76 | collapsed: true 77 | }, 78 | outputs: [ ], 79 | source: [ 80 | "query_users_items = """ ", 81 | "SELECT ", 82 | "data ", 83 | "FROM( ", 84 | " SELECT ", 85 | " fv, ", 86 | " COLLECT_LIST(STRUCT(sku, score * 2 AS score)) data ", 87 | " FROM test1 ", 88 | " GROUP BY 1 ", 89 | ") ", 90 | "WHERE size(data) between 2 and 20 ", 91 | "LIMIT 3 ", 92 | """"" 93 | ] 94 | }, 95 | { 96 | cell_type: "code", 97 | execution_count: null, 98 | metadata: { 99 | collapsed: true 100 | }, 101 | outputs: [ ], 102 | source: [ 103 | "users_rdd = spark.sql(query_users_items).rdd" 104 | ] 105 | }, 106 | { 107 | cell_type: "code", 108 | execution_count: null, 109 | metadata: { 110 | collapsed: true 111 | }, 112 | outputs: [ ], 113 | source: [ 114 | "def make_sparse(row): ", 115 | " tmp = sorted([(db.value[i.sku], i.score) for i in row.data], key=itemgetter(0)) ", 116 | " return (SparseVector(len(db.value), [e[0] for e in tmp], [e[1] for e in tmp]),) ", 117 | " ", 118 | "t0 = time.time() ", 119 | "mat = RowMatrix(users_rdd.map(lambda x: make_sparse(x)).toDF())" 120 | ] 121 | }, 122 | { 123 | cell_type: "code", 124 | execution_count: null, 125 | metadata: { 126 | collapsed: true 127 | }, 128 | outputs: [ ], 
129 | source: [ 130 | "a = mat.columnSimilarities(0)" 131 | ] 132 | } 133 | ], 134 | metadata: { 135 | kernelspec: { 136 | display_name: "PySpark", 137 | language: "python", 138 | name: "pyspark" 139 | }, 140 | language_info: { 141 | codemirror_mode: { 142 | name: "ipython", 143 | version: 3 144 | }, 145 | file_extension: ".py", 146 | mimetype: "text/x-python", 147 | name: "python", 148 | nbconvert_exporter: "python", 149 | pygments_lexer: "ipython3", 150 | version: "3.5.2" 151 | } 152 | }, 153 | nbformat: 4, 154 | nbformat_minor: 2 155 | } 156 | -------------------------------------------------------------------------------- /tests/unit/data/test_exporter.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 
22 | 23 | import unittest 24 | import sys 25 | import os 26 | import mock 27 | 28 | sys.path.append('./data') 29 | 30 | class Test_Exporter(unittest.TestCase): 31 | @staticmethod 32 | def _get_target_class(): 33 | from exporter import Exporter 34 | 35 | 36 | return Exporter 37 | 38 | @mock.patch('exporter.uuid') 39 | def test_run_bq_query(self, uuid_mock): 40 | class JobSpec(object): 41 | def __init__(self, destination): 42 | self._destination = destination 43 | self._errors = None 44 | self._maximum_bytes_billed = None 45 | 46 | @property 47 | def destination(self): 48 | return self._desgination 49 | 50 | @destination.setter 51 | def destination(self, value): 52 | self._destination = value 53 | 54 | def run(self): 55 | pass 56 | 57 | @property 58 | def errors(self): 59 | return self._errors 60 | 61 | @errors.setter 62 | def errors(self, value): 63 | self._errors = value 64 | 65 | @property 66 | def maximum_bytes_billed(self): 67 | return self._maximum_bytes_billed 68 | 69 | @maximum_bytes_billed.setter 70 | def maximum_bytes_billed(self, value): 71 | self._maximum_bytes_billed = value 72 | 73 | def begin(self): 74 | pass 75 | 76 | def result(self): 77 | pass 78 | 79 | uuid_mock.uuid4.return_value = 'test_id' 80 | klass = self._get_target_class()() 81 | job_mock = mock.Mock(spec=JobSpec) 82 | job_mock.errors = None 83 | 84 | client_mock = mock.Mock() 85 | client_mock.run_async_query.return_value = job_mock 86 | 87 | klass.run_bq_query(client_mock, 88 | 'query_test', 89 | {'threshold': 2, 90 | 'destination': 'test', 91 | 'maximum_bytes_billed': 100}) 92 | 93 | self.assertEqual(job_mock.destination, 'test') 94 | self.assertEqual(job_mock.maximum_bytes_billed, 100) 95 | client_mock.run_async_query.assert_called_once_with(*['test_id', 'query_test']) 96 | 97 | with self.assertRaises(Exception): 98 | job_mock.errors = 'error' 99 | klass.run_bq_query(client_mock, 'test', {}) 100 | -------------------------------------------------------------------------------- /tests/system/data/neighbor/train/1/train.json: -------------------------------------------------------------------------------- 1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 3 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 4 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 5 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 6 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 7 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 8 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": 
"djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 9 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 10 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 11 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 12 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 13 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 14 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 15 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 16 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 17 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 18 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 19 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 20 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "2"}, {"group_id": "3"}]}}} 21 | -------------------------------------------------------------------------------- /tests/unit/spark_jobs/test_top_seller.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 
14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | 24 | import unittest 25 | import sys 26 | import mock 27 | import json 28 | import datetime 29 | import math 30 | from collections import namedtuple 31 | 32 | from pyspark.sql import types as stypes 33 | sys.path.append('./spark_jobs') 34 | 35 | 36 | class Test_TopSeller(unittest.TestCase): 37 | @staticmethod 38 | def _get_target_class(): 39 | from top_seller import MarrecoTopSellerJob 40 | 41 | 42 | return MarrecoTopSellerJob 43 | 44 | 45 | def test_load_top_seller_schema(self): 46 | klass = self._get_target_class()() 47 | expected = stypes.StructType(fields=[ 48 | stypes.StructField("item_key", stypes.StringType()), 49 | stypes.StructField("value", stypes.IntegerType())]) 50 | 51 | result = klass._load_top_seller_schema() 52 | 53 | self.assertEqual(expected, result) 54 | 55 | 56 | def test_render_inter_uri(self): 57 | klass = self._get_target_class()() 58 | expected = 'folder/part-*' 59 | result = klass._render_inter_uri('folder') 60 | self.assertEqual(expected, result) 61 | 62 | 63 | def test_process_json_product_view(self): 64 | klass = self._get_target_class()() 65 | data = open('tests/data/productview_mock.json').read() 66 | 67 | result = list(klass._process_json(data)) 68 | self.assertEqual(result, []) 69 | 70 | 71 | def test_process_json_search(self): 72 | klass = self._get_target_class()() 73 | 74 | data = open('tests/data/search_mock.json').read() 75 | result = list(klass._process_json(data)) 76 | self.assertEqual(result, []) 77 | 78 | 79 | def test_process_json_orderconfirmation(self): 80 | klass = self._get_target_class()() 81 | 82 | data = open('tests/data/orderconfirmation_mock.json').read() 83 | result = list(klass._process_json(data)) 84 | expected = [('DA923SHF35RHK', 1), ('VI618SHF69UQC', 1)] 85 | 86 | self.assertEqual(expected, result) 87 | 88 | 89 | def test_process_sysargs(self): 90 | input = ['--days_init=2', 91 | '--days_end=3', 92 | '--source_uri=source_uri', 93 | '--inter_uri=inter_uri', 94 | '--top_seller_uri=top_seller_uri', 95 | '--force=no'] 96 | 97 | klass = self._get_target_class()() 98 | args = klass.process_sysargs(input) 99 | self.assertEqual(args.days_init, 2) 100 | self.assertEqual(args.days_end, 3) 101 | self.assertEqual(args.source_uri, 'source_uri') 102 | self.assertEqual(args.inter_uri, 'inter_uri') 103 | self.assertEqual(args.top_seller_uri, 'top_seller_uri') 104 | self.assertEqual(args.force, 'no') 105 | 106 | -------------------------------------------------------------------------------- /tests/system/data/top_seller/train/1/train.json: -------------------------------------------------------------------------------- 1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 3 | {"event": {"identifiers": 
{"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 4 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 5 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 6 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 7 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 8 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 9 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 10 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 11 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 12 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 13 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 14 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 15 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 16 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}} 17 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}} 18 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}} 19 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}} 20 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "2"}, {"group_id": "3"}], "quantities": [1, 
1]}}} 21 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}], "quantities": [1]}}} 22 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}], "quantities": [1]}}} 23 | -------------------------------------------------------------------------------- /data/exporter.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | """ 24 | Exports data from BigQuery to GCS for future spark jobs. 25 | """ 26 | 27 | import os 28 | import uuid 29 | 30 | from jinja2 import Environment, FileSystemLoader 31 | from google.cloud.bigquery import Client 32 | 33 | 34 | class Exporter(object): 35 | def bq_to_gcs(self, 36 | client, 37 | query, 38 | bq_config, 39 | gcs_config): 40 | """Runs ``query`` against BigQuery and exports the results to GCS. 41 | 42 | :type config: dict 43 | :param config: parameters to set the job constructor to run in BQ, 44 | such as destination table, dataset, expiration time. 45 | 46 | :type gcs_bucket: str 47 | :param gcs_bucket: bucket in where to save the query results. 48 | """ 49 | self.run_bq_query(client, query, bq_config) 50 | self.export_to_gcs(client, gcs_config) 51 | 52 | 53 | def run_bq_query(self, client, query, config): 54 | """Runs ``query`` against BQ 55 | 56 | :type client: data.clients.bq.uClient 57 | :param client: bq client for job operations. 58 | 59 | :type config: dict 60 | :param config: general information for job execution. 61 | 62 | :raises Exception: on ``job.errors`` is not None. 63 | """ 64 | job = client.run_async_query(str(uuid.uuid4()), query) 65 | job = self._update_job_attrs(job, config) 66 | job.begin() 67 | job.result() 68 | if job.errors: 69 | raise Exception(str(job.errors)) 70 | 71 | 72 | def export_to_gcs(self, client, config): 73 | """Runs job to export table from BigQuery to GCS. 74 | 75 | :type client: `google.cloud.bigquery.Client` 76 | :param client: bigquery client to run the job. 77 | 78 | :type config: dict 79 | :param config: key values to setup the job execution. 80 | 81 | :raises Exception: on ``job.errors`` is not None. 
82 | """ 83 | job = client.extract_table_to_storage(str(uuid.uuid4()), 84 | config['table'], 85 | config['uri']) 86 | 87 | job = self._update_job_attrs(job, config) 88 | job.begin() 89 | result = job.result() 90 | if result.errors: 91 | raise Exception(str(result.errors)) 92 | 93 | 94 | def _update_job_attrs(self, job, config): 95 | """Updates job attributes before running ``begin`` or ``run``. 96 | 97 | :type job: `google.cloud.bigquery.job.Job` 98 | :param job: job to be executed. 99 | 100 | :type config: dict 101 | :param config: values with attributes to update how ``job`` should be 102 | executed. 103 | 104 | :rtype job: Job 105 | :returns: job with updated attributes. 106 | """ 107 | for key, value in config.items(): 108 | if key in set(dir(job)): 109 | job.__setattr__(key, value) 110 | return job 111 | -------------------------------------------------------------------------------- /bin/export_datajet.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | 24 | """ 25 | Export data from BigQuery to GCS for PySpark Neighborhood 26 | """ 27 | 28 | import argparse 29 | import sys 30 | 31 | sys.path.append('..') 32 | 33 | from data.exporter import Exporter 34 | from data.help import Jinjafy 35 | from google.cloud.bigquery import Client 36 | 37 | 38 | def get_sysargs(args): 39 | parser = argparse.ArgumentParser() 40 | 41 | parser.add_argument('--days_init', 42 | dest='days_init', 43 | type=int, 44 | help=("Total amount of days to come back in time " 45 | "from today's date.")) 46 | 47 | parser.add_argument('--days_end', 48 | dest='days_end', 49 | type=int, 50 | help=("Total amount of days to come back in time " 51 | "from today's date.")) 52 | 53 | parser.add_argument('--table', 54 | dest='table', 55 | type=str, 56 | help=("Table name for where to save results in BQ.")) 57 | 58 | parser.add_argument('--dataset', 59 | dest='dataset', 60 | type=str, 61 | help=('Name of dataset to export BQ tables to.')) 62 | 63 | parser.add_argument('--uri', 64 | dest='uri', 65 | type=str, 66 | help=('URI name to save the contents in GCS')) 67 | 68 | args = parser.parse_args(args) 69 | return args 70 | 71 | def build_query(jinjafy, query, input): 72 | """builds neighborhood query. 73 | 74 | :type jinjafy: `data.help.Jinjafy` 75 | :param jinjafy: handler for jinja operations. 
76 | 77 | :type input: dict 78 | :param input: values to be used in jinja rendering. 79 | 80 | :rtype query: str 81 | :param query: query after jinja rendered runs. 82 | """ 83 | return jinjafy.render_template(query, **input) 84 | 85 | def main(): 86 | args = get_sysargs(sys.argv[1:]) 87 | exporter = Exporter() 88 | jinjafy = Jinjafy('../data/queries/marreco/datajet/') 89 | 90 | client = Client() 91 | dataset = client.dataset(args.dataset) 92 | table = dataset.table(args.table) 93 | 94 | for day in range(args.days_init, args.days_end - 1, -1): 95 | print('processing day: ', day) 96 | for idx, file_ in enumerate(['productview.sql', 97 | 'search.sql', 98 | 'purchase.sql']): 99 | 100 | query = build_query(jinjafy, 101 | file_, 102 | {'dataset': '40663402', 103 | 'days_interval': day, 104 | 'days_interval_end': day}) 105 | 106 | exporter.bq_to_gcs(client, 107 | query, 108 | {'destination': table, 109 | 'maximum_bytes_billed': 1000000000000, 110 | 'write_disposition': 'WRITE_TRUNCATE'}, 111 | {'uri': args.uri.format(day=day, idx=idx), 112 | 'table': table, 113 | 'compression': 'GZIP', 114 | 'destination_format': 'NEWLINE_DELIMITED_JSON'}) 115 | 116 | 117 | if __name__ == '__main__': 118 | sys.exit(main()) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Marreco 2 | Implements the algorithm [DIMSUM](http://arxiv.org/abs/1304.1467) using a PySpark implementation. 3 | 4 | ## Getting Started 5 | This repository is built to implement the algorithm DIMSUM on a set of data containing customers interactions on products for a given web commerce. 6 | 7 | The folder `data` was implemented so to manipulate data that is used as input for the algorithm. It follows already a pre-defined schema that transforms data from Google BigQuery GA data to the specified schema (and saves results to a user input specified URI, as will further be discussed below. 8 | 9 | The main folder of this repository is `spark_jobs` where you'll find the main algorithm implemented, specifically, the file `spark_jobs/neighbor.py`. 10 | 11 | To run a neighbor job against spark using [Google Dataproc](https://cloud.google.com/dataproc/), this is one example of how to do so: 12 | 13 | ```sh 14 | gcloud dataproc jobs submit pyspark \ 15 | --cluster=test3 \ 16 | --properties=spark.hadoop.fs.s3n.awsAccessKeyId=,spark.hadoop.fs.s3n.awsSecretAccessKey= \ 17 | --py-files=base.py,factory.py,neighbor.py \ 18 | --bucket=lbanor \ 19 | run_marreco.py -- \ 20 | --days_init=7 \ 21 | --days_end=3 \ 22 | --source_uri=gs://lbanor/pyspark/datajet/dt={}/*.gz \ 23 | --inter_uri=gs://lbanor/pyspark/marreco/neighbor/intermediate/{} \ 24 | --threshold=0.1 \ 25 | --force=no \ 26 | --decay=0.03 \ 27 | --w_browse=0.5 \ 28 | --w_purchase=6.0 \ 29 | --neighbor_uri=s3n://gfg-reco/similarities_matrix/ \ 30 | --algorithm=neighbor 31 | ``` 32 | 33 | In this example, notice the `source_uri` is a template for where to get datajet data from. The `{}` is 34 | later used for string formatting in python (where the date is set). 35 | 36 | Next we have `inter_uri` and this is where intermediary results are saved. By intermediary results, this means 37 | the result of the pre-processing that each algorithm applies on datajet data to get its input schema setup for 38 | later usage. 39 | 40 | Finally we have the `neighbor_uri` and that's where we save the final results. 
The example shown above contains values 41 | that we used in our own production environment. Please change them according to your infrastructure. 42 | 43 | For the `top_seller` algorithm, here is an example: 44 | 45 | ```sh 46 | gcloud dataproc jobs submit pyspark --cluster=test3 \ 47 | --properties=spark.hadoop.fs.s3n.awsAccessKeyId=,spark.hadoop.fs.s3n.awsSecretAccessKey= \ 48 | --py-files=base.py,factory.py,top_seller.py \ 49 | --bucket=lbanor \ 50 | run_marreco.py -- \ 51 | --days_init=7 \ 52 | --days_end=3 \ 53 | --source_uri=gs://lbanor/pyspark/datajet/dt={}/*.gz \ 54 | --inter_uri=gs://lbanor/pyspark/marreco/top_seller/intermediate/{} \ 55 | --force=no \ 56 | --top_seller_uri=s3n://gfg-reco/top_seller_array/ \ 57 | --algorithm=top_seller 58 | ``` 59 | 60 | To access the *help* menu, you can run: 61 | 62 | ```sh 63 | python run_marreco.py -h 64 | ``` 65 | 66 | For information about a specific algorithm, you can run (replace "neighbor" with any other available *algorithm* you desire): 67 | 68 | ```sh 69 | python run_marreco.py --algorithm=neighbor -h 70 | ``` 71 | 72 | Examples of running each algorithm can be found in the folder `bin`, such as the file `bin/dataproc_neighbor.sh`. 73 | 74 | ### Neighbor Algorithm 75 | 76 | For the neighborhood algorithm, you can set the parameter `threshold`, which defines the similarity value above which results are guaranteed to converge to their real values with a given probability and relative error. For instance, with `threshold=0.1`, every similarity above this value converges to its real value within the stated probability and relative error; in exchange for losing precision on the similarities below it, the job requires fewer computing resources. 77 | 78 | ## Pre-Requisites 79 | 80 | Main dependencies are: 81 | * *pyspark*, with spark installed and ready to receive jobs. 82 | * Jinja2 83 | * Numpy (for unit tests) 84 | * *pytest*, *pytest-cov* and *mock* 85 | 86 | ## Running Unit Tests 87 | 88 | There are two types of tests in this project: *unit* and *system*. Running the latter requires a local spark cluster able to receive the jobs. 89 | 90 | To run the *unit tests*, go to the main folder and run: 91 | 92 | ```sh 93 | py.test tests/unit/ --quiet --cov=. 94 | ``` 95 | 96 | For *integration testing*, each test must be run separately to avoid spark conflicts: 97 | 98 | ```sh 99 | py.test tests/system/spark_jobs/test_neighbor.py --quiet --cov=. --cov-fail-under=100 100 | ``` 101 | 102 | Or for top seller: 103 | 104 | ```sh 105 | py.test tests/system/spark_jobs/test_top_seller.py --quiet --cov=. --cov-fail-under=100 106 | ``` 107 | 108 | Note that the integration tests take much longer, as they initialize a spark context for the tests.
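As a closing illustration of how the `--days_init`/`--days_end` window and the `{}` placeholder in `source_uri`/`inter_uri` are resolved, the sketch below condenses the date logic of `get_formatted_date` in `spark_jobs/base.py` and the day loop used by the jobs. The values are the ones from the commands above; the snippet is illustrative only and not a utility shipped with the repository.

```python
import datetime

def get_formatted_date(day):
    # Same rule as MarrecoBase.get_formatted_date: "day" days before today,
    # rendered as YYYY-MM-DD.
    return (datetime.datetime.now() -
            datetime.timedelta(days=day)).strftime('%Y-%m-%d')

# With --days_init=7 and --days_end=3 the jobs walk the window of days
# 7, 6, 5, 4, 3 and fill the "{}" placeholder of each URI template:
source_uri = 'gs://lbanor/pyspark/datajet/dt={}/*.gz'
for day in range(7, 3 - 1, -1):
    print(source_uri.format(get_formatted_date(day)))
```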
109 | -------------------------------------------------------------------------------- /tests/unit/spark_jobs/test_neighbor.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | import unittest 24 | import sys 25 | import mock 26 | import json 27 | import datetime 28 | import math 29 | from collections import namedtuple 30 | 31 | from pyspark.sql import types as stypes 32 | sys.path.append('./spark_jobs') 33 | 34 | 35 | class Test_neighbor(unittest.TestCase): 36 | @staticmethod 37 | def _get_target_class(): 38 | from neighbor import MarrecoNeighborJob 39 | 40 | 41 | return MarrecoNeighborJob 42 | 43 | 44 | def test_users_matrix_schema(self): 45 | klass = self._get_target_class()() 46 | expected = stypes.StructType(fields=[ 47 | stypes.StructField("user_id", stypes.StringType()), 48 | stypes.StructField('interacted_items', stypes.ArrayType( 49 | stypes.StructType(fields=[stypes.StructField('key', 50 | stypes.StringType()), stypes.StructField('score', 51 | stypes.FloatType())])))]) 52 | 53 | self.assertEqual(expected, klass._load_users_matrix_schema()) 54 | 55 | 56 | def test_neighbor_schema(self): 57 | klass = self._get_target_class()() 58 | expected = stypes.StructType(fields=[ 59 | stypes.StructField("item_key", stypes.StringType()), 60 | stypes.StructField("similarity_items", stypes.ArrayType( 61 | stypes.StructType(fields=[ 62 | stypes.StructField("key", stypes.StringType()), 63 | stypes.StructField("score", stypes.FloatType())])))]) 64 | 65 | self.assertEqual(expected, klass._load_neighbor_schema()) 66 | 67 | 68 | @mock.patch('neighbor.random') 69 | def test_run_dimsum(self, random_mock): 70 | klass = self._get_target_class()() 71 | 72 | random_mock.random.return_value = 0.5 73 | class BroadDict(object): 74 | def __init__(self, dict_): 75 | self.value = dict_ 76 | 77 | pq_b = BroadDict({'0': [0.6, 2.], 78 | '1': [0.6, 2.], 79 | '2': [0.3, 2.], 80 | '3': [0.6, 4.]}) 81 | 82 | row = [('0', 2.), ('1', 4.), ('2', 6.), ('3', 8)] 83 | expected = [(('0', '1'), 2), (('0', '3'), 2.), (('1', '3'), 4.)] 84 | 85 | result = list(klass._run_DIMSUM(row, pq_b)) 86 | self.assertEqual(expected, result) 87 | 88 | 89 | def test_process_scores(self): 90 | klass = self._get_target_class()() 91 | row = ['0', [('0', 1.), ('1', 2.), ('2', 3.)]] 92 | expected = [('0', 1.), ('1', 4.), ('2', 9)] 93 | 94 | 
result = list(klass._process_scores(row)) 95 | self.assertEqual(expected, result) 96 | 97 | 98 | def test_render_inter_uri(self): 99 | klass = self._get_target_class()() 100 | 101 | expected = 'test_uri/part-*' 102 | result = klass._render_inter_uri('test_uri') 103 | 104 | self.assertEqual(expected, result) 105 | 106 | @mock.patch('neighbor.datetime') 107 | def test_process_json_product_view(self, datetime_mock): 108 | datetime_mock.datetime.now.return_value = datetime.datetime.utcfromtimestamp( 109 | 1502685428091 / 1000) 110 | datetime_mock.datetime.utcfromtimestamp.return_value = \ 111 | datetime.datetime(*[2017, 8, 13]) 112 | 113 | data = open('tests/data/productview_mock.json').read() 114 | 115 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay']) 116 | args = Args(0.5, 2., 1.5) 117 | 118 | klass = self._get_target_class()() 119 | result = list(klass._process_json(data, args)) 120 | expected = [['25e35a54c8cace51', ('MA042APM76IPJ', math.exp(-1.5 * 1) * args.w_browse)]] 121 | self.assertEqual(expected, result) 122 | 123 | 124 | @mock.patch('neighbor.datetime') 125 | def test_process_json_orderconfirmation(self, datetime_mock): 126 | datetime_mock.datetime.now.return_value = datetime.datetime.utcfromtimestamp( 127 | 1502685428091 / 1000) 128 | datetime_mock.datetime.utcfromtimestamp.return_value = \ 129 | datetime.datetime(*[2017, 8, 13]) 130 | 131 | data = open('tests/data/orderconfirmation_mock.json').read() 132 | 133 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay']) 134 | args = Args(0.5, 2., 1.5) 135 | 136 | klass = self._get_target_class()() 137 | result = list(klass._process_json(data, args)) 138 | expected = [['610574c802ba3b33', 139 | ('DA923SHF35RHK', math.exp(-1.5 * 1) * args.w_purchase)], 140 | ['610574c802ba3b33', 141 | ('VI618SHF69UQC', math.exp(-1.5 * 1) * args.w_purchase)]] 142 | self.assertEqual(expected, result) 143 | 144 | 145 | def test_process_json_search(self): 146 | data = open('tests/data/search_mock.json').read() 147 | 148 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay']) 149 | args = Args(0.5, 2., 1.5) 150 | 151 | klass = self._get_target_class()() 152 | result = list(klass._process_json(data, args)) 153 | expected = [] 154 | self.assertEqual(expected, result) 155 | 156 | 157 | def test_aggregate_skus(self): 158 | row = ['0', [('1', 0.5), ('2', 1.), ('1', 1.)]] 159 | expected = [('0', [('1', 1.5), ('2', 1.)])] 160 | 161 | klass = self._get_target_class()() 162 | result = list(klass._aggregate_skus(row)) 163 | self.assertEqual(expected, result) 164 | 165 | 166 | def test_process_sysargs(self): 167 | args = ['--days_init=3', 168 | '--days_end=2', 169 | '--source_uri=source_uri', 170 | '--inter_uri=inter_uri', 171 | '--threshold=0.5', 172 | '--force=yes', 173 | '--users_matrix_uri=users_uri', 174 | '--neighbor_uri=neighbor_uri', 175 | '--w_browse=0.6', 176 | '--w_purchase=1.5'] 177 | 178 | klass = self._get_target_class()() 179 | args = klass.process_sysargs(args) 180 | self.assertEqual(args.days_init, 3) 181 | self.assertEqual(args.days_end, 2) 182 | self.assertEqual(args.source_uri, 'source_uri') 183 | self.assertEqual(args.inter_uri, 'inter_uri') 184 | self.assertEqual(args.threshold, 0.5) 185 | self.assertEqual(args.force, 'yes') 186 | self.assertEqual(args.users_matrix_uri, 'users_uri') 187 | self.assertEqual(args.neighbor_uri, 'neighbor_uri') 188 | self.assertEqual(args.w_browse, 0.6) 189 | self.assertEqual(args.w_purchase, 1.5) 190 | -------------------------------------------------------------------------------- 
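A note on the `_run_DIMSUM` fixture tested above: the expected pairs are consistent with a DIMSUM-style mapper that keeps an item pair only when both items pass their sampling coin flips, and then emits the product of the two scores divided by the product of the two column norms. The sketch below is a minimal, self-contained illustration of that rule (it is not the project's `neighbor.py` implementation); here `pq` plays the role of the broadcast dictionary from the test, mapping each sku to `[sampling_probability, norm]`.

```python
import random

def dimsum_mapper(row, pq):
    """Yield DIMSUM-style similarity contributions for one user's items.

    ``row`` is a list of (sku, score) pairs and ``pq`` maps each sku to
    ``[sampling_probability, norm]``. A pair (sku_i, sku_j) is emitted only
    when both skus pass their coin flips, with value
    score_i * score_j / (norm_i * norm_j).
    """
    for i, (sku_i, score_i) in enumerate(row):
        if random.random() < pq[sku_i][0]:
            for sku_j, score_j in row[i + 1:]:
                if random.random() < pq[sku_j][0]:
                    yield ((sku_i, sku_j),
                           score_i * score_j / (pq[sku_i][1] * pq[sku_j][1]))

# The unit test fixture: with every coin flip mocked to 0.5, only skus
# '0', '1' and '3' survive (probability 0.6 > 0.5), which yields exactly
# [(('0', '1'), 2.0), (('0', '3'), 2.0), (('1', '3'), 4.0)].
pq = {'0': [0.6, 2.], '1': [0.6, 2.], '2': [0.3, 2.], '3': [0.6, 4.]}
row = [('0', 2.), ('1', 4.), ('2', 6.), ('3', 8.)]
print(list(dimsum_mapper(row, pq)))  # output varies with the real RNG
```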
/notebooks/marreco_df.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | cells: [ 3 | { 4 | cell_type: "code", 5 | execution_count: 57, 6 | metadata: { }, 7 | outputs: [ 8 | { 9 | data: { 10 | text/plain: [ 11 | "['In', ", 12 | " 'Out', ", 13 | " 'SQLContext', ", 14 | " 'SparkConf', ", 15 | " 'SparkContext', ", 16 | " 'SparkSession', ", 17 | " '_', ", 18 | " '_19', ", 19 | " '_23', ", 20 | " '_28', ", 21 | " '_33', ", 22 | " '_36', ", 23 | " '_39', ", 24 | " '_4', ", 25 | " '_44', ", 26 | " '_47', ", 27 | " '_50', ", 28 | " '_56', ", 29 | " '__', ", 30 | " '___', ", 31 | " '__builtin__', ", 32 | " '__builtins__', ", 33 | " '__doc__', ", 34 | " '__loader__', ", 35 | " '__name__', ", 36 | " '__package__', ", 37 | " '__spec__', ", 38 | " '_dh', ", 39 | " '_i', ", 40 | " '_i1', ", 41 | " '_i10', ", 42 | " '_i11', ", 43 | " '_i12', ", 44 | " '_i13', ", 45 | " '_i14', ", 46 | " '_i15', ", 47 | " '_i16', ", 48 | " '_i17', ", 49 | " '_i18', ", 50 | " '_i19', ", 51 | " '_i2', ", 52 | " '_i20', ", 53 | " '_i21', ", 54 | " '_i22', ", 55 | " '_i23', ", 56 | " '_i24', ", 57 | " '_i25', ", 58 | " '_i26', ", 59 | " '_i27', ", 60 | " '_i28', ", 61 | " '_i29', ", 62 | " '_i3', ", 63 | " '_i30', ", 64 | " '_i31', ", 65 | " '_i32', ", 66 | " '_i33', ", 67 | " '_i34', ", 68 | " '_i35', ", 69 | " '_i36', ", 70 | " '_i37', ", 71 | " '_i38', ", 72 | " '_i39', ", 73 | " '_i4', ", 74 | " '_i40', ", 75 | " '_i41', ", 76 | " '_i42', ", 77 | " '_i43', ", 78 | " '_i44', ", 79 | " '_i45', ", 80 | " '_i46', ", 81 | " '_i47', ", 82 | " '_i48', ", 83 | " '_i49', ", 84 | " '_i5', ", 85 | " '_i50', ", 86 | " '_i51', ", 87 | " '_i52', ", 88 | " '_i53', ", 89 | " '_i54', ", 90 | " '_i55', ", 91 | " '_i56', ", 92 | " '_i57', ", 93 | " '_i6', ", 94 | " '_i7', ", 95 | " '_i8', ", 96 | " '_i9', ", 97 | " '_ih', ", 98 | " '_ii', ", 99 | " '_iii', ", 100 | " '_oh', ", 101 | " '_pythonstartup', ", 102 | " 'atexit', ", 103 | " 'build_correlations', ", 104 | " 'combine_skus', ", 105 | " 'conf', ", 106 | " 'defaultdict', ", 107 | " 'exit', ", 108 | " 'get_ipython', ", 109 | " 'math', ", 110 | " 'os', ", 111 | " 'platform', ", 112 | " 'py4j', ", 113 | " 'query', ", 114 | " 'quit', ", 115 | " 'r', ", 116 | " 'sc', ", 117 | " 'schema', ", 118 | " 'sfunc', ", 119 | " 'spark', ", 120 | " 'sql', ", 121 | " 'sqlContext', ", 122 | " 'sqlCtx', ", 123 | " 'stypes', ", 124 | " 'time', ", 125 | " 'train_df', ", 126 | " 'warnings']" 127 | ] 128 | }, 129 | execution_count: 57, 130 | metadata: { }, 131 | output_type: "execute_result" 132 | } 133 | ], 134 | source: [ 135 | "dir()" 136 | ] 137 | }, 138 | { 139 | cell_type: "code", 140 | execution_count: 92, 141 | metadata: { 142 | collapsed: true 143 | }, 144 | outputs: [ ], 145 | source: [ 146 | "import time ", 147 | "from collections import defaultdict ", 148 | "from pyspark.sql import functions as sfunc ", 149 | "from pyspark.sql import types as stypes ", 150 | "import math ", 151 | "import sys" 152 | ] 153 | }, 154 | { 155 | cell_type: "code", 156 | execution_count: 2, 157 | metadata: { 158 | collapsed: true 159 | }, 160 | outputs: [ ], 161 | source: [ 162 | "schema = stypes.StructType().add("fv", stypes.StringType()).add("sku", stypes.StringType()).add("score", stypes.FloatType())" 163 | ] 164 | }, 165 | { 166 | cell_type: "code", 167 | execution_count: 3, 168 | metadata: { 169 | collapsed: true 170 | }, 171 | outputs: [ ], 172 | source: [ 173 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema)" 174 | ] 175 
| }, 176 | { 177 | cell_type: "code", 178 | execution_count: 4, 179 | metadata: { }, 180 | outputs: [ 181 | { 182 | data: { 183 | text/plain: [ 184 | "[Row(fv='3383270414872112082', sku='MO578SHF77RTI', score=0.5), ", 185 | " Row(fv='7143168022217708588', sku='DA923SHF54UJP', score=0.5), ", 186 | " Row(fv='8844960186636261737', sku='LU621ACM67NYU', score=0.5)]" 187 | ] 188 | }, 189 | execution_count: 4, 190 | metadata: { }, 191 | output_type: "execute_result" 192 | } 193 | ], 194 | source: [ 195 | "tt = train_df.head(3)" 196 | ] 197 | }, 198 | { 199 | cell_type: "code", 200 | execution_count: 96, 201 | metadata: { 202 | collapsed: true 203 | }, 204 | outputs: [ ], 205 | source: [ 206 | "tt = train_df.collect()" 207 | ] 208 | }, 209 | { 210 | cell_type: "code", 211 | execution_count: 98, 212 | metadata: { }, 213 | outputs: [ 214 | { 215 | data: { 216 | text/plain: [ 217 | "Row(fv='3383270414872112082', sku='MO578SHF77RTI', score=0.5)" 218 | ] 219 | }, 220 | execution_count: 98, 221 | metadata: { }, 222 | output_type: "execute_result" 223 | } 224 | ], 225 | source: [ 226 | "tt[0]" 227 | ] 228 | }, 229 | { 230 | cell_type: "code", 231 | execution_count: 97, 232 | metadata: { }, 233 | outputs: [ 234 | { 235 | data: { 236 | text/plain: [ 237 | "42915448" 238 | ] 239 | }, 240 | execution_count: 97, 241 | metadata: { }, 242 | output_type: "execute_result" 243 | } 244 | ], 245 | source: [ 246 | "sys.getsizeof(tt)" 247 | ] 248 | }, 249 | { 250 | cell_type: "code", 251 | execution_count: 20, 252 | metadata: { 253 | collapsed: true 254 | }, 255 | outputs: [ ], 256 | source: [ 257 | "train_df.createOrReplaceTempView('test1')" 258 | ] 259 | }, 260 | { 261 | cell_type: "code", 262 | execution_count: 10, 263 | metadata: { 264 | collapsed: true 265 | }, 266 | outputs: [ ], 267 | source: [ 268 | "def build_correlations(row): ", 269 | " return [{"sku": e.sku, "corr": [{"sku": i.sku, "score": e.score * i.score} for i in row]} for e in row] ", 270 | "sqlContext.udf.register("BUILD_CORRELATIONS", build_correlations, stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("corr", stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("score", stypes.FloatType(), False)])), False)])))" 271 | ] 272 | }, 273 | { 274 | cell_type: "code", 275 | execution_count: 51, 276 | metadata: { 277 | collapsed: true 278 | }, 279 | outputs: [ ], 280 | source: [ 281 | "def combine_skus(ref_sku, row): ", 282 | " d = defaultdict(float) ", 283 | " ref_norm = 0.0 ", 284 | " for inner_row in row: ", 285 | " for e in inner_row: ", 286 | " d[e.sku] += e.score ", 287 | " if e.sku == ref_sku: ", 288 | " ref_norm += e.score ", 289 | " ref_norm = math.sqrt(ref_norm) ", 290 | " return {"norm": ref_norm, "corr": [{"sku": key, "similarity": value / ref_norm} for key, value in d.items()]} ", 291 | "sqlContext.udf.register("COMBINE_SKUS", combine_skus, stypes.StructType(fields=[stypes.StructField("norm", stypes.FloatType(), False), stypes.StructField("corr", stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("similarity", stypes.FloatType(), False)]) ) )]))" 292 | ] 293 | }, 294 | { 295 | cell_type: "code", 296 | execution_count: 85, 297 | metadata: { 298 | collapsed: true 299 | }, 300 | outputs: [ ], 301 | source: [ 302 | "query = """ ", 303 | "SELECT ", 304 | " data.sku sku, ", 305 | " COMBINE_SKUS(data.sku, COLLECT_LIST(data.corr)) data ", 306 | "FROM( ", 
307 | " SELECT ", 308 | " EXPLODE(BUILD_CORRELATIONS(data)) data ", 309 | " FROM( ", 310 | " SELECT ", 311 | " fv, ", 312 | " COLLECT_LIST(STRUCT(sku, score)) data ", 313 | " FROM test1 ", 314 | " GROUP BY ", 315 | " fv ", 316 | " HAVING SIZE(data) > 1 AND SIZE(data) < 200 ", 317 | " ) ", 318 | ") ", 319 | "GROUP BY ", 320 | " data.sku ", 321 | """"" 322 | ] 323 | }, 324 | { 325 | cell_type: "code", 326 | execution_count: 81, 327 | metadata: { 328 | collapsed: true 329 | }, 330 | outputs: [ ], 331 | source: [ 332 | "r1 = spark.sql(query)" 333 | ] 334 | }, 335 | { 336 | cell_type: "code", 337 | execution_count: 82, 338 | metadata: { 339 | collapsed: true 340 | }, 341 | outputs: [ ], 342 | source: [ 343 | "r1.createOrReplaceTempView('test2')" 344 | ] 345 | }, 346 | { 347 | cell_type: "code", 348 | execution_count: 69, 349 | metadata: { 350 | collapsed: true 351 | }, 352 | outputs: [ ], 353 | source: [ 354 | "query_extract_norms = """ ", 355 | "SELECT ", 356 | " sku, ", 357 | " data.norm norm ", 358 | "FROM test2 ", 359 | """"" 360 | ] 361 | }, 362 | { 363 | cell_type: "code", 364 | execution_count: 84, 365 | metadata: { }, 366 | outputs: [ 367 | { 368 | name: "stdout", 369 | output_type: "stream", 370 | text: [ 371 | "1481.6083595752716 " 372 | ] 373 | } 374 | ], 375 | source: [ 376 | "t0 = time.time() ", 377 | "r2 = {e.sku: e.norm for e in spark.sql(query_extract_norms).collect()} ", 378 | "print(time.time() - t0)" 379 | ] 380 | }, 381 | { 382 | cell_type: "code", 383 | execution_count: 86, 384 | metadata: { 385 | collapsed: true 386 | }, 387 | outputs: [ ], 388 | source: [ 389 | "r2_broad = sc.broadcast(r2)" 390 | ] 391 | }, 392 | { 393 | cell_type: "code", 394 | execution_count: 87, 395 | metadata: { 396 | collapsed: true 397 | }, 398 | outputs: [ ], 399 | source: [ 400 | "def normalize_corrs(corrs): ", 401 | " return [{"sku": e.sku, "similarity": e.similarity / r2_broad.value[e.sku]} for e in corrs] ", 402 | "sqlContext.udf.register("NORMALIZE_CORRS", normalize_corrs, stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("similarity", stypes.FloatType(), False)])))" 403 | ] 404 | }, 405 | { 406 | cell_type: "code", 407 | execution_count: 88, 408 | metadata: { 409 | collapsed: true 410 | }, 411 | outputs: [ ], 412 | source: [ 413 | "final_query = """ ", 414 | "select ", 415 | "sku, ", 416 | "NORMALIZE_CORRS(data.corr) corr ", 417 | "FROM test2 ", 418 | """"" 419 | ] 420 | }, 421 | { 422 | cell_type: "code", 423 | execution_count: 90, 424 | metadata: { 425 | collapsed: true 426 | }, 427 | outputs: [ ], 428 | source: [ 429 | "final = spark.sql(final_query)" 430 | ] 431 | }, 432 | { 433 | cell_type: "code", 434 | execution_count: 91, 435 | metadata: { }, 436 | outputs: [ 437 | { 438 | name: "stdout", 439 | output_type: "stream", 440 | text: [ 441 | "381.65184354782104 " 442 | ] 443 | } 444 | ], 445 | source: [ 446 | "t0 = time.time() ", 447 | "final.head(1) ", 448 | "print(time.time() - t0)" 449 | ] 450 | }, 451 | { 452 | cell_type: "code", 453 | execution_count: null, 454 | metadata: { 455 | collapsed: true 456 | }, 457 | outputs: [ ], 458 | source: [ ] 459 | }, 460 | { 461 | cell_type: "code", 462 | execution_count: null, 463 | metadata: { 464 | collapsed: true 465 | }, 466 | outputs: [ ], 467 | source: [ ] 468 | } 469 | ], 470 | metadata: { 471 | kernelspec: { 472 | display_name: "PySpark", 473 | language: "python", 474 | name: "pyspark" 475 | }, 476 | language_info: { 477 | codemirror_mode: { 478 | name: "ipython", 479 | 
version: 3 480 | }, 481 | file_extension: ".py", 482 | mimetype: "text/x-python", 483 | name: "python", 484 | nbconvert_exporter: "python", 485 | pygments_lexer: "ipython3", 486 | version: "3.5.2" 487 | } 488 | }, 489 | nbformat: 4, 490 | nbformat_minor: 2 491 | } 492 | -------------------------------------------------------------------------------- /spark_jobs/top_seller.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | """ 24 | Set of tools to run Marreco's Top Seller algorithm in spark. 25 | """ 26 | 27 | import os 28 | import sys 29 | import json 30 | import operator 31 | import math 32 | import random 33 | import argparse 34 | from collections import defaultdict 35 | 36 | sys.path.append('..') 37 | 38 | from base import MarrecoBase 39 | from py4j.protocol import Py4JJavaError 40 | from pyspark.sql.utils import AnalysisException 41 | from pyspark.sql import SparkSession 42 | from pyspark.sql import types as stypes 43 | 44 | 45 | class MarrecoTopSellerJob(MarrecoBase): 46 | """This Class has all methods necessary to build Marreco Neighborhood 47 | against Spark. 48 | 49 | :type context: `pyspark.SparkContext` 50 | :param context: context in which Jobs are ran against. 51 | """ 52 | def transform_data(self, sc, args): 53 | """This method gets datajet files as input and prepare them on a daily 54 | intermediary basis for Marreco's Top Seller algorithm. 55 | 56 | :type sc: spark context 57 | :param sc: spark context for running jobs. 58 | 59 | :param kwargs: 60 | 61 | :type days_init: int 62 | :param days: how many days to scan through the files to be used 63 | in the transformation phase. 64 | 65 | :type days_end: int 66 | :param days_end: 67 | 68 | :type inter_uri: str 69 | :param inter_uri: uri for where to save intermediate results. 70 | 71 | :type force: str 72 | :param force: either ``yes``, in which case forces recreation of 73 | files, or ``no``, which in case if files already 74 | exist then does nothing. 75 | 76 | :type source_uri: str 77 | :param source_uri: URI from where to read files. 
78 | """ 79 | spark = SparkSession(sc) 80 | for day in range(args.days_init, args.days_end - 1, -1): 81 | formatted_day = self.get_formatted_date(day) 82 | 83 | source_uri = args.source_uri.format(formatted_day) 84 | inter_uri = args.inter_uri.format(formatted_day) 85 | try: 86 | inter_data = spark.read.json(inter_uri, 87 | schema = self._load_top_seller_schema()).first() 88 | 89 | if args.force == 'yes' or not inter_data: 90 | self._process_datajet_day(sc, 91 | source_uri, 92 | inter_uri, 93 | 'overwrite') 94 | except (Py4JJavaError, AnalysisException): 95 | self._process_datajet_day(sc, source_uri, inter_uri) 96 | finally: 97 | print('processed data for {} day'.format(day)) 98 | 99 | 100 | def _process_datajet_day(self, sc, uri, inter_uri, mode=None): 101 | """Gets datajet json like files and transforms them into data like 102 | [(sku, items_sold),...] saving it in the end. 103 | 104 | :type sc: spark context 105 | :param sc: context to run spark jobs. 106 | 107 | :type uri: str 108 | :param uri: where the files are located. 109 | 110 | :type inter_uri: str 111 | :param inter_uri: where intermediate results should be saved. 112 | 113 | :type mode: str 114 | :param mode: indicates how data should be saved. If ``None`` then 115 | throws error if file already exist. If ``overwrite`` then 116 | deletes previous file and saves new one. 117 | """ 118 | sc.textFile(uri) \ 119 | .flatMap(lambda x: self._process_json(x)) \ 120 | .filter(lambda x: x) \ 121 | .reduceByKey(operator.add) \ 122 | .toDF(schema=self._load_top_seller_schema()) \ 123 | .write.json(inter_uri, compression='gzip', mode=mode) 124 | 125 | 126 | def _load_top_seller_schema(self): 127 | """Loads schema for top seller intermediate data saved like 128 | [sku, items_sold] 129 | 130 | :rtype: `pyspark.sql.StructType` 131 | :returns: schema for top selling data 132 | """ 133 | return stypes.StructType(fields=[ 134 | stypes.StructField("item_key", stypes.StringType()), 135 | stypes.StructField("value", stypes.IntegerType())]) 136 | 137 | 138 | def build_marreco(self, sc, args): 139 | """Main method for building Marreco's algorithms and saving results 140 | for later usage. 141 | 142 | :type sc: `pyspark.SparkContext` 143 | :param sc: spark context for running jobs. 144 | 145 | :type args: Namespace 146 | :param args: 147 | :type days_init: int 148 | :param days_init: which date time that will be used for reading data 149 | with intermediary daily results. 150 | 151 | :type days_end: int 152 | :param days_end: until what file to read input data. 153 | 154 | :type inter_uri: str 155 | :param inter_uri: URI where intermediary results should be read from 156 | 157 | :type source_uri: str 158 | :param source_uri: source from where to read input data 159 | 160 | :type force: str 161 | :param force: either ``yes`` in which case replace intermediate files 162 | or ``no`` where nothing is done if file already exists. 
163 |
164 |         :type top_seller_uri: str
165 |         :param top_seller_uri: URI for where to save results
166 |         """
167 |         spark = SparkSession(sc)
168 |         data = sc.emptyRDD()
169 |
170 |         for day in range(args.days_init, args.days_end - 1, -1):
171 |             formatted_day = self.get_formatted_date(day)
172 |             inter_uri = self._render_inter_uri(args.inter_uri.format(
173 |                 formatted_day))
174 |
175 |             data = data.union(spark.read.json(inter_uri,
176 |                 schema=self._load_top_seller_schema()).rdd)
177 |
178 |         data = data.reduceByKey(operator.add) \
179 |                    .sortBy(lambda x: x[1], False)
180 |         self._save_top_seller_matrix(args.top_seller_uri, data)
181 |
182 |
183 |     def _save_top_seller_matrix(self, top_seller_uri, data):
184 |         """Loads the top seller schema and saves final results as
185 |         [(item_key, items_sold), (item_key, items_sold), ...]
186 |
187 |         :type top_seller_uri: str
188 |         :param top_seller_uri: uri for where to save the matrix.
189 |
190 |         :type data: RDD
191 |         :param data: RDD with data like [item_key, items_sold]
192 |         """
193 |         data.toDF(schema=self._load_top_seller_schema()) \
194 |             .write.json(top_seller_uri, compression='gzip', mode='overwrite')
195 |
196 |
197 |     def _render_inter_uri(self, inter_uri, name_pattern='part-*'):
198 |         """Helper function to render inter_uri values for later usage.
199 |
200 |         :type inter_uri: str
201 |         :param inter_uri: URI used for saving intermediate data transformation
202 |                           results.
203 |
204 |         :type name_pattern: str
205 |         :param name_pattern: pattern used by spark to save multiple files.
206 |
207 |         :rtype: str
208 |         :returns: rendered URI used to read the saved data back.
209 |         """
210 |         return os.path.join(inter_uri, name_pattern)
211 |
212 |
213 |     @staticmethod
214 |     def _process_json(row):
215 |         """Mapper function that extracts, from each line of the datajet file,
216 |         interactions between customers and sold skus.
217 |
218 |         :type row: str
219 |         :param row: json string with datajet data.
220 |
221 |         :rtype: list
222 |         :returns: yields (sku, items_sold) tuples.
223 |         """
224 |         try:
225 |             r = json.loads(row)
226 |             if (r['event']['source']['tracker'] == 'fish' and
227 |                 'local_timestamp' in r['event'] and
228 |                 r['event']['identifiers']['djUCID']['value'] and
229 |                 r['event']['type'] == "orderconfirmation"):
230 |
231 |                 for e in list(zip([e['group_id'] for e in
232 |                         r['event']['details']['products']],
233 |                         ([int(e) for e in
234 |                         r['event']['details']['quantities']]))):
235 |                     yield e
236 |
237 |         except:
238 |             yield []
239 |
240 |
241 |     @staticmethod
242 |     def process_sysargs(args):
243 |         parser = argparse.ArgumentParser()
244 |
245 |         parser.add_argument('--days_init',
246 |                             dest='days_init',
247 |                             type=int,
248 |                             help=("Total amount of days to come back in time "
249 |                                   "from today's date."))
250 |
251 |         parser.add_argument('--days_end',
252 |                             dest='days_end',
253 |                             type=int,
254 |                             help=("Offset in days from today's date at which "
255 |                                   "to stop reading data (the most recent day)."))
256 |
257 |         parser.add_argument('--source_uri',
258 |                             dest='source_uri',
259 |                             type=str,
260 |                             help=("URI template from which to read source "
261 |                                   "files."))
262 |
263 |         parser.add_argument('--inter_uri',
264 |                             dest='inter_uri',
265 |                             type=str,
266 |                             help=('URI for saving intermediary results.'))
267 |
268 |         parser.add_argument('--top_seller_uri',
269 |                             dest='top_seller_uri',
270 |                             type=str,
271 |                             help=('URI for saving top_seller results.'))
272 |
273 |         parser.add_argument('--force',
274 |                             dest='force',
275 |                             type=str,
276 |                             help=('If ``yes`` then replace all files with new ones.
' 277 | ' If ``no``, then no replacing happens.')) 278 | 279 | args = parser.parse_args(args) 280 | return args 281 | -------------------------------------------------------------------------------- /tests/system/spark_jobs/test_top_seller.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | """ 24 | This is system tests and should only be run if the environment has pyspark 25 | and a spark cluster installed to receive on-demand jobs 26 | """ 27 | 28 | import os 29 | import unittest 30 | import sys 31 | import mock 32 | import json 33 | import datetime 34 | import pyspark 35 | import math 36 | import glob 37 | import shutil 38 | from collections import namedtuple 39 | import numpy as np 40 | 41 | from pyspark.sql import types as stypes 42 | sys.path.append('./spark_jobs') 43 | 44 | py_files = ['./spark_jobs/top_seller.py', 45 | './spark_jobs/base.py', 46 | './spark_jobs/factory.py'] 47 | 48 | 49 | class Test_top_seller(unittest.TestCase): 50 | 51 | _sc = pyspark.SparkContext(pyFiles=py_files) 52 | _session = pyspark.sql.SparkSession(_sc) 53 | _to_delete_uris = [] 54 | 55 | 56 | @staticmethod 57 | def _get_target_class(): 58 | from top_seller import MarrecoTopSellerJob 59 | 60 | 61 | return MarrecoTopSellerJob 62 | 63 | 64 | @staticmethod 65 | def _delete_dirs(*args): 66 | for arg in args: 67 | if os.path.isdir(arg): 68 | shutil.rmtree(arg) 69 | 70 | 71 | def _prepare_daily_data(self): 72 | for i in [1, 2]: 73 | uri = 'tests/system/data/top_seller/train/{}/train.json'.format( 74 | i) 75 | data = self._sc.textFile(uri) 76 | formatted_day = (datetime.datetime.now() - 77 | datetime.timedelta(days=i)).strftime('%Y-%m-%d') 78 | 79 | save_uri = 'tests/system/data/top_seller/train/{}/train.json'.format( 80 | formatted_day) 81 | self._delete_dirs(save_uri) 82 | self._to_delete_uris.append(os.path.dirname(save_uri)) 83 | data.saveAsTextFile(save_uri) 84 | 85 | 86 | def _delete_uris(self): 87 | for uri in self._to_delete_uris: 88 | self._delete_dirs(uri) 89 | self._to_delete_uris = [] 90 | 91 | 92 | def test_process_datajet_day_no_force(self): 93 | klass = self._get_target_class()() 94 | inter_uri = 'tests/system/data/top_seller/dj' 95 | self._delete_dirs(inter_uri) 96 | self.assertFalse(os.path.isdir(inter_uri)) 97 | 98 | klass._process_datajet_day(self._sc, 99 | 
'tests/system/data/top_seller/datajet_test.json', 100 | inter_uri, 101 | mode=None) 102 | 103 | result = self._session.read.json(inter_uri).toJSON().collect() 104 | expected = ['{"item_key":"DA923SHF35RHK","value":1}', 105 | '{"item_key":"VI618SHF69UQC","value":1}'] 106 | 107 | self.assertEqual(result, expected) 108 | self._delete_dirs(inter_uri) 109 | self.assertFalse(os.path.isdir(inter_uri)) 110 | 111 | 112 | def test_process_datajet_day_yes_force(self): 113 | klass = self._get_target_class()() 114 | inter_uri = 'tests/system/data/top_seller/dj' 115 | self._delete_dirs(inter_uri) 116 | self.assertFalse(os.path.isdir(inter_uri)) 117 | 118 | klass._process_datajet_day(self._sc, 119 | 'tests/system/data/top_seller/datajet_test.json', 120 | inter_uri, 121 | mode=None) 122 | 123 | klass._process_datajet_day(self._sc, 124 | 'tests/system/data/top_seller/datajet_test.json', 125 | inter_uri, 126 | mode='overwrite') 127 | 128 | result = self._session.read.json(inter_uri).toJSON().collect() 129 | expected = ['{"item_key":"DA923SHF35RHK","value":1}', 130 | '{"item_key":"VI618SHF69UQC","value":1}'] 131 | 132 | self.assertEqual(result, expected) 133 | self._delete_dirs(inter_uri) 134 | self.assertFalse(os.path.isdir(inter_uri)) 135 | 136 | 137 | def test_transform_data_no_force(self): 138 | klass = self._get_target_class()() 139 | inter_uri = 'tests/system/data/top_seller/inter/{}' 140 | Args = namedtuple('args', ['days_init', 141 | 'days_end', 142 | 'force', 143 | 'source_uri', 144 | 'inter_uri']) 145 | 146 | self._prepare_daily_data() 147 | 148 | args = Args(2, 1, 'no', 149 | 'tests/system/data/top_seller/train/{}/train.json', 150 | inter_uri) 151 | klass.transform_data(self._sc, args) 152 | 153 | data1_uri = ['{"item_key":"2","value":1}', 154 | '{"item_key":"3","value":1}', 155 | '{"item_key":"0","value":2}'] 156 | 157 | data2_uri = ['{"item_key":"0","value":1}', 158 | '{"item_key":"1","value":1}', 159 | '{"item_key":"2","value":2}'] 160 | 161 | expected = {2: data2_uri, 162 | 1: data1_uri} 163 | 164 | for day in range(args.days_init, args.days_end - 1, -1): 165 | formatted_day = klass.get_formatted_date(day) 166 | result = self._session.read.json(inter_uri.format(formatted_day), 167 | schema=klass._load_top_seller_schema()).toJSON().collect() 168 | self.assertEqual(result, expected[day]) 169 | 170 | for day in [2, 1]: 171 | formatted_day = klass.get_formatted_date(day) 172 | self._delete_dirs(inter_uri.format(formatted_day)) 173 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 174 | self._delete_uris() 175 | self.assertEqual(self._to_delete_uris, []) 176 | 177 | 178 | def test_transform_data_yes_force(self): 179 | klass = self._get_target_class()() 180 | inter_uri = 'tests/system/data/top_seller/inter/{}' 181 | Args = namedtuple('args', ['days_init', 182 | 'days_end', 183 | 'force', 184 | 'source_uri', 185 | 'inter_uri']) 186 | 187 | self._prepare_daily_data() 188 | 189 | args = Args(2, 1, 'no', 190 | 'tests/system/data/top_seller/train/{}/train.json', 191 | inter_uri) 192 | klass.transform_data(self._sc, args) 193 | 194 | args = Args(2, 1, 'yes', 195 | 'tests/system/data/top_seller/train/{}/train.json', 196 | inter_uri) 197 | klass.transform_data(self._sc, args) 198 | 199 | data1_uri = ['{"item_key":"2","value":1}', 200 | '{"item_key":"3","value":1}', 201 | '{"item_key":"0","value":2}'] 202 | 203 | data2_uri = ['{"item_key":"0","value":1}', 204 | '{"item_key":"1","value":1}', 205 | '{"item_key":"2","value":2}'] 206 | 207 | expected = {2: data2_uri, 208 | 1: data1_uri} 209 | 
210 | for day in range(args.days_init, args.days_end - 1, -1): 211 | formatted_day = klass.get_formatted_date(day) 212 | result = self._session.read.json(inter_uri.format(formatted_day), 213 | schema=klass._load_top_seller_schema()).toJSON().collect() 214 | self.assertEqual(result, expected[day]) 215 | 216 | for day in [2, 1]: 217 | formatted_day = klass.get_formatted_date(day) 218 | self._delete_dirs(inter_uri.format(formatted_day)) 219 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 220 | self._delete_uris() 221 | self.assertEqual(self._to_delete_uris, []) 222 | 223 | 224 | def test_build_marreco_yes_force(self): 225 | klass = self._get_target_class()() 226 | inter_uri = 'tests/system/data/top_seller/inter/{}' 227 | result_uri = 'tests/system/data/top_seller/result' 228 | 229 | self._prepare_daily_data() 230 | 231 | Args = namedtuple('args', ['days_init', 232 | 'days_end', 233 | 'force', 234 | 'source_uri', 235 | 'inter_uri', 236 | 'top_seller_uri']) 237 | 238 | args = Args(2, 1, 'no', 239 | 'tests/system/data/top_seller/train/{}/train.json', 240 | inter_uri, 241 | 'tests/system/data/top_seller/result') 242 | klass.transform_data(self._sc, args) 243 | klass.build_marreco(self._sc, args) 244 | 245 | args = Args(2, 1, 'yes', 246 | 'tests/system/data/top_seller/train/{}/train.json', 247 | inter_uri, 248 | 'tests/system/data/top_seller/result') 249 | klass.transform_data(self._sc, args) 250 | klass.build_marreco(self._sc, args) 251 | 252 | expected = ['{"item_key":"0","value":3}', 253 | '{"item_key":"1","value":1}', 254 | '{"item_key":"2","value":3}', 255 | '{"item_key":"3","value":1}'] 256 | 257 | result = sorted(self._session.read.json(result_uri, 258 | schema=klass._load_top_seller_schema()).toJSON().collect()) 259 | 260 | self.assertEqual(result, expected) 261 | self._delete_dirs(result_uri) 262 | 263 | for day in [2, 1]: 264 | formatted_day = klass.get_formatted_date(day) 265 | self._delete_dirs(inter_uri.format(formatted_day)) 266 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 267 | self._delete_uris() 268 | self.assertEqual(self._to_delete_uris, []) 269 | 270 | 271 | def test_build_marreco_no_force(self): 272 | klass = self._get_target_class()() 273 | inter_uri = 'tests/system/data/top_seller/inter/{}' 274 | result_uri = 'tests/system/data/top_seller/result' 275 | Args = namedtuple('args', ['days_init', 276 | 'days_end', 277 | 'force', 278 | 'source_uri', 279 | 'inter_uri', 280 | 'top_seller_uri']) 281 | 282 | self._prepare_daily_data() 283 | 284 | args = Args(2, 1, 'no', 285 | 'tests/system/data/top_seller/train/{}/train.json', 286 | inter_uri, 287 | 'tests/system/data/top_seller/result') 288 | klass.transform_data(self._sc, args) 289 | klass.build_marreco(self._sc, args) 290 | 291 | expected = ['{"item_key":"0","value":3}', 292 | '{"item_key":"1","value":1}', 293 | '{"item_key":"2","value":3}', 294 | '{"item_key":"3","value":1}'] 295 | 296 | result = sorted(self._session.read.json(result_uri, 297 | schema=klass._load_top_seller_schema())\ 298 | .toJSON().collect()) 299 | 300 | self.assertEqual(result, expected) 301 | 302 | self._delete_dirs(result_uri) 303 | 304 | for day in [2, 1]: 305 | formatted_day = klass.get_formatted_date(day) 306 | self._delete_dirs(inter_uri.format(formatted_day)) 307 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 308 | self._delete_uris() 309 | self.assertEqual(self._to_delete_uris, []) 310 | 311 | -------------------------------------------------------------------------------- 
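Note on the job and tests above: the heart of the Top Seller pipeline is small. It parses order-confirmation events into (sku, quantity) pairs, sums the quantities per sku, and sorts the totals in descending order. The sketch below shows that flow end to end on toy data; it is illustrative only, as the event layout and the local SparkContext are assumptions rather than the real datajet schema or the cluster setup used by this repo.

import json
import operator

from pyspark import SparkContext

sc = SparkContext('local[1]', 'top_seller_sketch')

# Toy order-confirmation events; the real files follow the datajet schema instead.
events = [
    json.dumps({"skus": ["A", "B"], "quantities": [1, 2]}),
    json.dumps({"skus": ["A"], "quantities": [3]}),
]

def sku_quantities(row):
    """Yields (sku, quantity) pairs from one JSON event line."""
    r = json.loads(row)
    return zip(r["skus"], (int(q) for q in r["quantities"]))

top_sellers = (sc.parallelize(events)
                 .flatMap(sku_quantities)                    # -> (sku, quantity) pairs
                 .reduceByKey(operator.add)                  # total units sold per sku
                 .sortBy(lambda kv: kv[1], ascending=False)  # best sellers first
                 .collect())

print(top_sellers)  # [('A', 4), ('B', 2)]
sc.stop()

In the actual job the same aggregation is split into a daily transform step (one intermediate file per day) and a build step that unions the daily RDDs before reducing, which is what the system tests above exercise.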
/notebooks/marreco_dimsum_internal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | cells: [ 3 | { 4 | cell_type: "code", 5 | execution_count: 2, 6 | metadata: { 7 | collapsed: true 8 | }, 9 | outputs: [ ], 10 | source: [ 11 | "from pyspark.mllib.linalg import SparseVector ", 12 | "from pyspark.mllib.linalg.distributed import RowMatrix ", 13 | "import numpy as np ", 14 | "from sklearn.metrics.pairwise import cosine_similarity ", 15 | "import time ", 16 | "from collections import defaultdict ", 17 | "from pyspark.sql import functions as sfunc ", 18 | "from pyspark.sql import types as stypes ", 19 | "import math ", 20 | "import sys ", 21 | "from pyspark.ml.linalg import SparseVector ", 22 | "from pyspark.mllib.linalg.distributed import RowMatrix ", 23 | "from operator import itemgetter ", 24 | "import operator ", 25 | "import random" 26 | ] 27 | }, 28 | { 29 | cell_type: "code", 30 | execution_count: 3, 31 | metadata: { 32 | collapsed: true 33 | }, 34 | outputs: [ ], 35 | source: [ 36 | "schema = stypes.StructType().add("fv", stypes.StringType()).add("sku", stypes.StringType()).add("score", stypes.FloatType()) ", 37 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema) ", 38 | "train_df.createOrReplaceTempView('test1')" 39 | ] 40 | }, 41 | { 42 | cell_type: "code", 43 | execution_count: 188, 44 | metadata: { }, 45 | outputs: [ 46 | { 47 | name: "stdout", 48 | output_type: "stream", 49 | text: [ 50 | "[Row(fv='1005105267406228429', sku='FI911SHF89UBM-50', score=5.0)] " 51 | ] 52 | } 53 | ], 54 | source: [ 55 | "print(train_df.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))" 56 | ] 57 | }, 58 | { 59 | cell_type: "code", 60 | execution_count: 82, 61 | metadata: { 62 | collapsed: true 63 | }, 64 | outputs: [ ], 65 | source: [ 66 | "# query = """ ", 67 | "# SELECT ", 68 | "# sku, ", 69 | "# ROW_NUMBER() OVER (ORDER BY SUM(1)) -1 idx ", 70 | "# FROM test1 ", 71 | "# GROUP BY 1 ", 72 | "# """ ", 73 | "# skus_rdd = spark.sql(query).rdd" 74 | ] 75 | }, 76 | { 77 | cell_type: "code", 78 | execution_count: 4, 79 | metadata: { 80 | collapsed: true 81 | }, 82 | outputs: [ ], 83 | source: [ 84 | "query_statistics = """ ", 85 | "SELECT ", 86 | " sku, ", 87 | " SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) / SQRT(SUM(score * score)) p, ", 88 | " IF(SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) > SQRT(SUM(score * score)), SQRT(SUM(score * score)), SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold})) q --- implements the min(gamma, ||c||) ", 89 | "FROM test1 ", 90 | "GROUP BY 1 ", 91 | """"" 92 | ] 93 | }, 94 | { 95 | cell_type: "code", 96 | execution_count: 8, 97 | metadata: { }, 98 | outputs: [ ], 99 | source: [ 100 | "skus_stats = spark.sql(query_statistics.format(threshold=0.1))" 101 | ] 102 | }, 103 | { 104 | cell_type: "code", 105 | execution_count: 9, 106 | metadata: { }, 107 | outputs: [ 108 | { 109 | name: "stdout", 110 | output_type: "stream", 111 | text: [ 112 | "[Row(sku='FI911SHF89UBM-50', p=7.132311576894841, q=5.0)] " 113 | ] 114 | } 115 | ], 116 | source: [ 117 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))" 118 | ] 119 | }, 120 | { 121 | cell_type: "code", 122 | execution_count: 178, 123 | metadata: { }, 124 | outputs: [ 125 | { 126 | data: { 127 | text/plain: [ 128 | "[Row(sku='PO140ACU06DDD', p=2.4697175158107982, q=14.439529078193651), ", 129 | " Row(sku='PO140ACU76FVN', p=35.661557884474206, q=1.0), ", 130 | " Row(sku='JU082SHF02WWZ', p=3.790780833876121, 
q=9.40744386111339)]" 131 | ] 132 | }, 133 | execution_count: 178, 134 | metadata: { }, 135 | output_type: "execute_result" 136 | } 137 | ], 138 | source: [ 139 | "sku_stats.take(3)" 140 | ] 141 | }, 142 | { 143 | cell_type: "code", 144 | execution_count: 194, 145 | metadata: { }, 146 | outputs: [ 147 | { 148 | name: "stdout", 149 | output_type: "stream", 150 | text: [ 151 | "[] " 152 | ] 153 | } 154 | ], 155 | source: [ 156 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))" 157 | ] 158 | }, 159 | { 160 | cell_type: "code", 161 | execution_count: null, 162 | metadata: { 163 | collapsed: true 164 | }, 165 | outputs: [ ], 166 | source: [ 167 | "# query_statistics = """ ", 168 | "# SELECT ", 169 | "# sku, ", 170 | "# {gamma} / SQRT(SUM(score * score)) p, ", 171 | "# IF({gamma} > SQRT(SUM(score * score)), SQRT(SUM(score * score)), {gamma}) q ", 172 | "# FROM test1 ", 173 | "# GROUP BY 1 ", 174 | "# """" 175 | ] 176 | }, 177 | { 178 | cell_type: "code", 179 | execution_count: 60, 180 | metadata: { 181 | collapsed: true 182 | }, 183 | outputs: [ ], 184 | source: [ 185 | "# def get_gamma(threshold, numCols): ", 186 | "# return math.sqrt(10 * math.log(numCols) / threshold) if threshold > 10e-6 else math.inf" 187 | ] 188 | }, 189 | { 190 | cell_type: "code", 191 | execution_count: 76, 192 | metadata: { }, 193 | outputs: [ 194 | { 195 | name: "stdout", 196 | output_type: "stream", 197 | text: [ 198 | "35.57234899487128 " 199 | ] 200 | } 201 | ], 202 | source: [ 203 | "# gamma_b = sc.broadcast(get_gamma(10e-2)) ", 204 | "# print(gamma_b.value)" 205 | ] 206 | }, 207 | { 208 | cell_type: "code", 209 | execution_count: 77, 210 | metadata: { }, 211 | outputs: [ ], 212 | source: [ 213 | "# skus_stats = spark.sql(query_statistics.format(gamma=gamma_b.value))" 214 | ] 215 | }, 216 | { 217 | cell_type: "code", 218 | execution_count: 78, 219 | metadata: { }, 220 | outputs: [ 221 | { 222 | data: { 223 | text/plain: [ 224 | "[Row(sku='NI531SRM74IHX', p=2.8758539658272255, q=12.36931687685298), ", 225 | " Row(sku='MO578SHF45QNE', p=0.5225157525775272, q=35.57234899487128)]" 226 | ] 227 | }, 228 | execution_count: 78, 229 | metadata: { }, 230 | output_type: "execute_result" 231 | } 232 | ], 233 | source: [ 234 | "# skus_stats.head(2)" 235 | ] 236 | }, 237 | { 238 | cell_type: "code", 239 | execution_count: 10, 240 | metadata: { }, 241 | outputs: [ ], 242 | source: [ 243 | "pq_b = sc.broadcast({row.sku: [row.p, row.q] for row in skus_stats.collect()})" 244 | ] 245 | }, 246 | { 247 | cell_type: "code", 248 | execution_count: 11, 249 | metadata: { }, 250 | outputs: [ 251 | { 252 | data: { 253 | text/plain: [ 254 | "[7.132311576894841, 5.0]" 255 | ] 256 | }, 257 | execution_count: 11, 258 | metadata: { }, 259 | output_type: "execute_result" 260 | } 261 | ], 262 | source: [ 263 | "pq_b.value['FI911SHF89UBM-50']" 264 | ] 265 | }, 266 | { 267 | cell_type: "code", 268 | execution_count: 157, 269 | metadata: { 270 | collapsed: true 271 | }, 272 | outputs: [ ], 273 | source: [ 274 | "#skus_idx_b = sc.broadcast({sku: idx for idx, sku in enumerate(pq_b.value.keys())})" 275 | ] 276 | }, 277 | { 278 | cell_type: "code", 279 | execution_count: 158, 280 | metadata: { }, 281 | outputs: [ ], 282 | source: [ 283 | "#idx_skus_b = sc.broadcast({value: key for key, value in skus_idx_b.value.items()})" 284 | ] 285 | }, 286 | { 287 | cell_type: "code", 288 | execution_count: 53, 289 | metadata: { 290 | collapsed: true 291 | }, 292 | outputs: [ ], 293 | source: [ 294 | "# d = {row.sku: row.idx for row in 
skus_rdd.collect()} ", 295 | "# db = sc.broadcast(d) ", 296 | " ", 297 | "# id_ = {value: key for key, value in d.items()} ", 298 | "# id_b = sc.broadcast(id_)" 299 | ] 300 | }, 301 | { 302 | cell_type: "code", 303 | execution_count: 159, 304 | metadata: { 305 | collapsed: true 306 | }, 307 | outputs: [ ], 308 | source: [ 309 | "#numCols = sc.broadcast(len(idx_skus_b.value))" 310 | ] 311 | }, 312 | { 313 | cell_type: "code", 314 | execution_count: 57, 315 | metadata: { }, 316 | outputs: [ ], 317 | source: [ 318 | "# p = [0] * numCols.value ", 319 | "# for row in skus_stats" 320 | ] 321 | }, 322 | { 323 | cell_type: "code", 324 | execution_count: 55, 325 | metadata: { 326 | collapsed: true 327 | }, 328 | outputs: [ ], 329 | source: [ 330 | "#p = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()} # if 0 happens as the ``norm`` we expected an Exception to be raised. ", 331 | "#p_b = sc.broadcast(p)" 332 | ] 333 | }, 334 | { 335 | cell_type: "code", 336 | execution_count: 34, 337 | metadata: { }, 338 | outputs: [ ], 339 | source: [ 340 | "#q = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()}" 341 | ] 342 | }, 343 | { 344 | cell_type: "code", 345 | execution_count: 35, 346 | metadata: { }, 347 | outputs: [ 348 | { 349 | data: { 350 | text/plain: [ 351 | "312988" 352 | ] 353 | }, 354 | execution_count: 35, 355 | metadata: { }, 356 | output_type: "execute_result" 357 | } 358 | ], 359 | source: [ 360 | "#numCols.value" 361 | ] 362 | }, 363 | { 364 | cell_type: "code", 365 | execution_count: 31, 366 | metadata: { }, 367 | outputs: [ 368 | { 369 | data: { 370 | text/plain: [ 371 | "12.36931687685298" 372 | ] 373 | }, 374 | execution_count: 31, 375 | metadata: { }, 376 | output_type: "execute_result" 377 | } 378 | ], 379 | source: [ 380 | "#skus_s['NI531SRM74IHX']" 381 | ] 382 | }, 383 | { 384 | cell_type: "code", 385 | execution_count: 12, 386 | metadata: { 387 | collapsed: true 388 | }, 389 | outputs: [ ], 390 | source: [ 391 | "query_users_items = """ ", 392 | "SELECT ", 393 | "data ", 394 | "FROM( ", 395 | " SELECT ", 396 | " fv, ", 397 | " COLLECT_LIST(STRUCT(sku, score)) data ", 398 | " FROM test1 ", 399 | " GROUP BY 1 ", 400 | ") ", 401 | "WHERE SIZE(data) BETWEEN 2 AND 200 ", 402 | """" ", 403 | " ", 404 | "t0 = time.time() ", 405 | "users = spark.sql(query_users_items) ", 406 | "users_rdd = users.rdd" 407 | ] 408 | }, 409 | { 410 | cell_type: "code", 411 | execution_count: 148, 412 | metadata: { }, 413 | outputs: [ 414 | { 415 | data: { 416 | text/plain: [ 417 | "[Row(data=[Row(sku='CO796SCF87LXG', score=0.5), Row(sku='CO796SCM72JGT', score=0.5), Row(sku='CO796SCM23HHW', score=0.5)]), ", 418 | " Row(data=[Row(sku='HA723APF18CPL', score=0.5), Row(sku='CO515APF44YPR', score=0.5), Row(sku='LA906APF69OQC', score=0.5), Row(sku='TU142APF19BPC', score=0.5), Row(sku='CO515APF27DIA', score=0.5), Row(sku='GA753APF40NJR', score=0.5), Row(sku='GA753APF41NJQ', score=1.0)])]" 419 | ] 420 | }, 421 | execution_count: 148, 422 | metadata: { }, 423 | output_type: "execute_result" 424 | } 425 | ], 426 | source: [ 427 | "users.head(2)" 428 | ] 429 | }, 430 | { 431 | cell_type: "code", 432 | execution_count: 13, 433 | metadata: { 434 | collapsed: true 435 | }, 436 | outputs: [ ], 437 | source: [ 438 | "def map_cosines(row): ", 439 | " for i in range(len(row)): ", 440 | " value_i = row[i].score / pq_b.value[row[i].sku][1] ", 441 | " if random.random() < pq_b.value[row[i].sku][0]: ", 442 | " for j in range(i + 1, len(row)): ", 443 | " value_j = row[j].score / 
pq_b.value[row[j].sku][1] ", 444 | " if random.random() < pq_b.value[row[i].sku][0]: ", 445 | " yield ((row[i].sku, row[j].sku), value_i * value_j)" 446 | ] 447 | }, 448 | { 449 | cell_type: "code", 450 | execution_count: 14, 451 | metadata: { }, 452 | outputs: [ ], 453 | source: [ 454 | "users2 = users.rdd.flatMap(lambda row: map_cosines(row.data))" 455 | ] 456 | }, 457 | { 458 | cell_type: "code", 459 | execution_count: 150, 460 | metadata: { }, 461 | outputs: [ 462 | { 463 | data: { 464 | text/plain: [ 465 | "[(('CO796SCM72JGT', 'CO796SCM23HHW'), 0.0002015811797719921), ", 466 | " (('HA723APF18CPL', 'CO515APF44YPR'), 0.031234752377721216)]" 467 | ] 468 | }, 469 | execution_count: 150, 470 | metadata: { }, 471 | output_type: "execute_result" 472 | } 473 | ], 474 | source: [ 475 | "users2.take(2)" 476 | ] 477 | }, 478 | { 479 | cell_type: "code", 480 | execution_count: 15, 481 | metadata: { }, 482 | outputs: [ ], 483 | source: [ 484 | "final = users2.reduceByKey(operator.add)" 485 | ] 486 | }, 487 | { 488 | cell_type: "code", 489 | execution_count: 16, 490 | metadata: { }, 491 | outputs: [ 492 | { 493 | name: "stdout", 494 | output_type: "stream", 495 | text: [ 496 | "[(('VI618SHF35NCY-51', 'LU773ACF56ILV'), 0.029501220638256383), (('FI911APF72ZHF', 'KA952APF52DNB'), 0.015504341823651058), (('FA865ACF45CCS', 'QU097ACF14BCMN'), 0.7071067811865475)] ", 497 | "363.733115196228 " 498 | ] 499 | } 500 | ], 501 | source: [ 502 | "t0 = time.time() ", 503 | "print(final.take(3)) ", 504 | "print(time.time() - t0)" 505 | ] 506 | }, 507 | { 508 | cell_type: "code", 509 | execution_count: null, 510 | metadata: { 511 | collapsed: true 512 | }, 513 | outputs: [ ], 514 | source: [ ] 515 | }, 516 | { 517 | cell_type: "code", 518 | execution_count: null, 519 | metadata: { 520 | collapsed: true 521 | }, 522 | outputs: [ ], 523 | source: [ ] 524 | }, 525 | { 526 | cell_type: "code", 527 | execution_count: null, 528 | metadata: { 529 | collapsed: true 530 | }, 531 | outputs: [ ], 532 | source: [ ] 533 | }, 534 | { 535 | cell_type: "code", 536 | execution_count: null, 537 | metadata: { 538 | collapsed: true 539 | }, 540 | outputs: [ ], 541 | source: [ ] 542 | }, 543 | { 544 | cell_type: "code", 545 | execution_count: null, 546 | metadata: { 547 | collapsed: true 548 | }, 549 | outputs: [ ], 550 | source: [ ] 551 | }, 552 | { 553 | cell_type: "code", 554 | execution_count: null, 555 | metadata: { 556 | collapsed: true 557 | }, 558 | outputs: [ ], 559 | source: [ ] 560 | } 561 | ], 562 | metadata: { 563 | kernelspec: { 564 | display_name: "PySpark", 565 | language: "python", 566 | name: "pyspark" 567 | }, 568 | language_info: { 569 | codemirror_mode: { 570 | name: "ipython", 571 | version: 3 572 | }, 573 | file_extension: ".py", 574 | mimetype: "text/x-python", 575 | name: "python", 576 | nbconvert_exporter: "python", 577 | pygments_lexer: "ipython3", 578 | version: "3.5.2" 579 | } 580 | }, 581 | nbformat: 4, 582 | nbformat_minor: 2 583 | } 584 | -------------------------------------------------------------------------------- /notebooks/rdd_marreco_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | cells: [ 3 | { 4 | cell_type: "code", 5 | execution_count: 40, 6 | metadata: { 7 | collapsed: true 8 | }, 9 | outputs: [ ], 10 | source: [ 11 | "import operator" 12 | ] 13 | }, 14 | { 15 | cell_type: "code", 16 | execution_count: 50, 17 | metadata: { 18 | collapsed: true 19 | }, 20 | outputs: [ ], 21 | source: [ 22 | "import pyspark" 23 | ] 24 | }, 25 | { 26 | 
cell_type: "code", 27 | execution_count: 51, 28 | metadata: { }, 29 | outputs: [ 30 | { 31 | data: { 32 | text/plain: [ 33 | "'2.2.0'" 34 | ] 35 | }, 36 | execution_count: 51, 37 | metadata: { }, 38 | output_type: "execute_result" 39 | } 40 | ], 41 | source: [ 42 | "pyspark.__version__" 43 | ] 44 | }, 45 | { 46 | cell_type: "code", 47 | execution_count: 30, 48 | metadata: { 49 | collapsed: true 50 | }, 51 | outputs: [ ], 52 | source: [ 53 | "train_rdd = sc.textFile('gs://lbanor/pyspark/train_query*.gz')" 54 | ] 55 | }, 56 | { 57 | cell_type: "code", 58 | execution_count: 31, 59 | metadata: { 60 | collapsed: true 61 | }, 62 | outputs: [ ], 63 | source: [ 64 | "header = train_rdd.first() ", 65 | "train_rdd = train_rdd.filter(lambda x: x != header).map(lambda x: x.split(','))" 66 | ] 67 | }, 68 | { 69 | cell_type: "code", 70 | execution_count: 32, 71 | metadata: { }, 72 | outputs: [ 73 | { 74 | data: { 75 | text/plain: [ 76 | "[['3383270414872112082', 'MO578SHF77RTI', '0.5'], ", 77 | " ['7143168022217708588', 'DA923SHF54UJP', '0.5'], ", 78 | " ['8844960186636261737', 'LU621ACM67NYU', '0.5']]" 79 | ] 80 | }, 81 | execution_count: 32, 82 | metadata: { }, 83 | output_type: "execute_result" 84 | } 85 | ], 86 | source: [ 87 | "train_rdd.take(3)" 88 | ] 89 | }, 90 | { 91 | cell_type: "code", 92 | execution_count: 33, 93 | metadata: { 94 | collapsed: true 95 | }, 96 | outputs: [ ], 97 | source: [ 98 | "train_rdd = train_rdd.map(lambda x: (x[0], (x[1], float(x[2])))).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > 1)" 99 | ] 100 | }, 101 | { 102 | cell_type: "code", 103 | execution_count: 34, 104 | metadata: { }, 105 | outputs: [ 106 | { 107 | data: { 108 | text/plain: [ 109 | "[('7357279563665682536', ", 110 | " [('CO515SHF91TPO', 0.5), ", 111 | " ('MA862SHF07OZG', 1.0), ", 112 | " ('DA923SHF00KPV', 0.5), ", 113 | " ('RA626SHF48VKP', 0.5), ", 114 | " ('UP428APF54RFP', 0.5), ", 115 | " ('OU295APF41KVE', 0.5)]), ", 116 | " ('3831524958866269889', ", 117 | " [('CA278SHF45EVY', 1.0), ", 118 | " ('SA232SHF43XLS', 0.5), ", 119 | " ('SA232SHF74ADT', 0.5), ", 120 | " ('LA628SHF52JSD', 0.5), ", 121 | " ('SA232SHF29PWG', 0.5), ", 122 | " ('DO302SHF23LDS', 0.5), ", 123 | " ('DO302SHF37LDE', 0.5), ", 124 | " ('CA278SHF48EVV', 0.5), ", 125 | " ('LA628SHF40IHZ', 0.5), ", 126 | " ('CA278SHF97UGW', 0.5), ", 127 | " ('CA278SHF45EVY', 0.5)]), ", 128 | " ('7808161502332133024', ", 129 | " [('AS296SCF58FCV', 0.5), ('CA278SHF78LIB', 0.5), ('CA278SHF77LIC', 0.5)])]" 130 | ] 131 | }, 132 | execution_count: 34, 133 | metadata: { }, 134 | output_type: "execute_result" 135 | } 136 | ], 137 | source: [ 138 | "train_rdd.take(3)" 139 | ] 140 | }, 141 | { 142 | cell_type: "code", 143 | execution_count: 35, 144 | metadata: { 145 | collapsed: true 146 | }, 147 | outputs: [ ], 148 | source: [ 149 | "train_rdd = train_rdd.map(lambda corr: [[((corr[1][i][0], corr[1][j][0]), corr[1][i][1] * corr[1][j][1]) for i in range(len(corr[1]))] for j in range(len(corr[1]))])" 150 | ] 151 | }, 152 | { 153 | cell_type: "code", 154 | execution_count: 36, 155 | metadata: { }, 156 | outputs: [ 157 | { 158 | data: { 159 | text/plain: [ 160 | "[[[(('CO515SHF91TPO', 'CO515SHF91TPO'), 0.25), ", 161 | " (('MA862SHF07OZG', 'CO515SHF91TPO'), 0.5), ", 162 | " (('DA923SHF00KPV', 'CO515SHF91TPO'), 0.25), ", 163 | " (('RA626SHF48VKP', 'CO515SHF91TPO'), 0.25), ", 164 | " (('UP428APF54RFP', 'CO515SHF91TPO'), 0.25), ", 165 | " (('OU295APF41KVE', 'CO515SHF91TPO'), 0.25)], ", 166 | " [(('CO515SHF91TPO', 'MA862SHF07OZG'), 0.5), ", 167 | " 
(('MA862SHF07OZG', 'MA862SHF07OZG'), 1.0), ", 168 | " (('DA923SHF00KPV', 'MA862SHF07OZG'), 0.5), ", 169 | " (('RA626SHF48VKP', 'MA862SHF07OZG'), 0.5), ", 170 | " (('UP428APF54RFP', 'MA862SHF07OZG'), 0.5), ", 171 | " (('OU295APF41KVE', 'MA862SHF07OZG'), 0.5)], ", 172 | " [(('CO515SHF91TPO', 'DA923SHF00KPV'), 0.25), ", 173 | " (('MA862SHF07OZG', 'DA923SHF00KPV'), 0.5), ", 174 | " (('DA923SHF00KPV', 'DA923SHF00KPV'), 0.25), ", 175 | " (('RA626SHF48VKP', 'DA923SHF00KPV'), 0.25), ", 176 | " (('UP428APF54RFP', 'DA923SHF00KPV'), 0.25), ", 177 | " (('OU295APF41KVE', 'DA923SHF00KPV'), 0.25)], ", 178 | " [(('CO515SHF91TPO', 'RA626SHF48VKP'), 0.25), ", 179 | " (('MA862SHF07OZG', 'RA626SHF48VKP'), 0.5), ", 180 | " (('DA923SHF00KPV', 'RA626SHF48VKP'), 0.25), ", 181 | " (('RA626SHF48VKP', 'RA626SHF48VKP'), 0.25), ", 182 | " (('UP428APF54RFP', 'RA626SHF48VKP'), 0.25), ", 183 | " (('OU295APF41KVE', 'RA626SHF48VKP'), 0.25)], ", 184 | " [(('CO515SHF91TPO', 'UP428APF54RFP'), 0.25), ", 185 | " (('MA862SHF07OZG', 'UP428APF54RFP'), 0.5), ", 186 | " (('DA923SHF00KPV', 'UP428APF54RFP'), 0.25), ", 187 | " (('RA626SHF48VKP', 'UP428APF54RFP'), 0.25), ", 188 | " (('UP428APF54RFP', 'UP428APF54RFP'), 0.25), ", 189 | " (('OU295APF41KVE', 'UP428APF54RFP'), 0.25)], ", 190 | " [(('CO515SHF91TPO', 'OU295APF41KVE'), 0.25), ", 191 | " (('MA862SHF07OZG', 'OU295APF41KVE'), 0.5), ", 192 | " (('DA923SHF00KPV', 'OU295APF41KVE'), 0.25), ", 193 | " (('RA626SHF48VKP', 'OU295APF41KVE'), 0.25), ", 194 | " (('UP428APF54RFP', 'OU295APF41KVE'), 0.25), ", 195 | " (('OU295APF41KVE', 'OU295APF41KVE'), 0.25)]], ", 196 | " [[(('CA278SHF45EVY', 'CA278SHF45EVY'), 1.0), ", 197 | " (('SA232SHF43XLS', 'CA278SHF45EVY'), 0.5), ", 198 | " (('SA232SHF74ADT', 'CA278SHF45EVY'), 0.5), ", 199 | " (('LA628SHF52JSD', 'CA278SHF45EVY'), 0.5), ", 200 | " (('SA232SHF29PWG', 'CA278SHF45EVY'), 0.5), ", 201 | " (('DO302SHF23LDS', 'CA278SHF45EVY'), 0.5), ", 202 | " (('DO302SHF37LDE', 'CA278SHF45EVY'), 0.5), ", 203 | " (('CA278SHF48EVV', 'CA278SHF45EVY'), 0.5), ", 204 | " (('LA628SHF40IHZ', 'CA278SHF45EVY'), 0.5), ", 205 | " (('CA278SHF97UGW', 'CA278SHF45EVY'), 0.5), ", 206 | " (('CA278SHF45EVY', 'CA278SHF45EVY'), 0.5)], ", 207 | " [(('CA278SHF45EVY', 'SA232SHF43XLS'), 0.5), ", 208 | " (('SA232SHF43XLS', 'SA232SHF43XLS'), 0.25), ", 209 | " (('SA232SHF74ADT', 'SA232SHF43XLS'), 0.25), ", 210 | " (('LA628SHF52JSD', 'SA232SHF43XLS'), 0.25), ", 211 | " (('SA232SHF29PWG', 'SA232SHF43XLS'), 0.25), ", 212 | " (('DO302SHF23LDS', 'SA232SHF43XLS'), 0.25), ", 213 | " (('DO302SHF37LDE', 'SA232SHF43XLS'), 0.25), ", 214 | " (('CA278SHF48EVV', 'SA232SHF43XLS'), 0.25), ", 215 | " (('LA628SHF40IHZ', 'SA232SHF43XLS'), 0.25), ", 216 | " (('CA278SHF97UGW', 'SA232SHF43XLS'), 0.25), ", 217 | " (('CA278SHF45EVY', 'SA232SHF43XLS'), 0.25)], ", 218 | " [(('CA278SHF45EVY', 'SA232SHF74ADT'), 0.5), ", 219 | " (('SA232SHF43XLS', 'SA232SHF74ADT'), 0.25), ", 220 | " (('SA232SHF74ADT', 'SA232SHF74ADT'), 0.25), ", 221 | " (('LA628SHF52JSD', 'SA232SHF74ADT'), 0.25), ", 222 | " (('SA232SHF29PWG', 'SA232SHF74ADT'), 0.25), ", 223 | " (('DO302SHF23LDS', 'SA232SHF74ADT'), 0.25), ", 224 | " (('DO302SHF37LDE', 'SA232SHF74ADT'), 0.25), ", 225 | " (('CA278SHF48EVV', 'SA232SHF74ADT'), 0.25), ", 226 | " (('LA628SHF40IHZ', 'SA232SHF74ADT'), 0.25), ", 227 | " (('CA278SHF97UGW', 'SA232SHF74ADT'), 0.25), ", 228 | " (('CA278SHF45EVY', 'SA232SHF74ADT'), 0.25)], ", 229 | " [(('CA278SHF45EVY', 'LA628SHF52JSD'), 0.5), ", 230 | " (('SA232SHF43XLS', 'LA628SHF52JSD'), 0.25), ", 231 | " (('SA232SHF74ADT', 'LA628SHF52JSD'), 
0.25), ", 232 | " (('LA628SHF52JSD', 'LA628SHF52JSD'), 0.25), ", 233 | " (('SA232SHF29PWG', 'LA628SHF52JSD'), 0.25), ", 234 | " (('DO302SHF23LDS', 'LA628SHF52JSD'), 0.25), ", 235 | " (('DO302SHF37LDE', 'LA628SHF52JSD'), 0.25), ", 236 | " (('CA278SHF48EVV', 'LA628SHF52JSD'), 0.25), ", 237 | " (('LA628SHF40IHZ', 'LA628SHF52JSD'), 0.25), ", 238 | " (('CA278SHF97UGW', 'LA628SHF52JSD'), 0.25), ", 239 | " (('CA278SHF45EVY', 'LA628SHF52JSD'), 0.25)], ", 240 | " [(('CA278SHF45EVY', 'SA232SHF29PWG'), 0.5), ", 241 | " (('SA232SHF43XLS', 'SA232SHF29PWG'), 0.25), ", 242 | " (('SA232SHF74ADT', 'SA232SHF29PWG'), 0.25), ", 243 | " (('LA628SHF52JSD', 'SA232SHF29PWG'), 0.25), ", 244 | " (('SA232SHF29PWG', 'SA232SHF29PWG'), 0.25), ", 245 | " (('DO302SHF23LDS', 'SA232SHF29PWG'), 0.25), ", 246 | " (('DO302SHF37LDE', 'SA232SHF29PWG'), 0.25), ", 247 | " (('CA278SHF48EVV', 'SA232SHF29PWG'), 0.25), ", 248 | " (('LA628SHF40IHZ', 'SA232SHF29PWG'), 0.25), ", 249 | " (('CA278SHF97UGW', 'SA232SHF29PWG'), 0.25), ", 250 | " (('CA278SHF45EVY', 'SA232SHF29PWG'), 0.25)], ", 251 | " [(('CA278SHF45EVY', 'DO302SHF23LDS'), 0.5), ", 252 | " (('SA232SHF43XLS', 'DO302SHF23LDS'), 0.25), ", 253 | " (('SA232SHF74ADT', 'DO302SHF23LDS'), 0.25), ", 254 | " (('LA628SHF52JSD', 'DO302SHF23LDS'), 0.25), ", 255 | " (('SA232SHF29PWG', 'DO302SHF23LDS'), 0.25), ", 256 | " (('DO302SHF23LDS', 'DO302SHF23LDS'), 0.25), ", 257 | " (('DO302SHF37LDE', 'DO302SHF23LDS'), 0.25), ", 258 | " (('CA278SHF48EVV', 'DO302SHF23LDS'), 0.25), ", 259 | " (('LA628SHF40IHZ', 'DO302SHF23LDS'), 0.25), ", 260 | " (('CA278SHF97UGW', 'DO302SHF23LDS'), 0.25), ", 261 | " (('CA278SHF45EVY', 'DO302SHF23LDS'), 0.25)], ", 262 | " [(('CA278SHF45EVY', 'DO302SHF37LDE'), 0.5), ", 263 | " (('SA232SHF43XLS', 'DO302SHF37LDE'), 0.25), ", 264 | " (('SA232SHF74ADT', 'DO302SHF37LDE'), 0.25), ", 265 | " (('LA628SHF52JSD', 'DO302SHF37LDE'), 0.25), ", 266 | " (('SA232SHF29PWG', 'DO302SHF37LDE'), 0.25), ", 267 | " (('DO302SHF23LDS', 'DO302SHF37LDE'), 0.25), ", 268 | " (('DO302SHF37LDE', 'DO302SHF37LDE'), 0.25), ", 269 | " (('CA278SHF48EVV', 'DO302SHF37LDE'), 0.25), ", 270 | " (('LA628SHF40IHZ', 'DO302SHF37LDE'), 0.25), ", 271 | " (('CA278SHF97UGW', 'DO302SHF37LDE'), 0.25), ", 272 | " (('CA278SHF45EVY', 'DO302SHF37LDE'), 0.25)], ", 273 | " [(('CA278SHF45EVY', 'CA278SHF48EVV'), 0.5), ", 274 | " (('SA232SHF43XLS', 'CA278SHF48EVV'), 0.25), ", 275 | " (('SA232SHF74ADT', 'CA278SHF48EVV'), 0.25), ", 276 | " (('LA628SHF52JSD', 'CA278SHF48EVV'), 0.25), ", 277 | " (('SA232SHF29PWG', 'CA278SHF48EVV'), 0.25), ", 278 | " (('DO302SHF23LDS', 'CA278SHF48EVV'), 0.25), ", 279 | " (('DO302SHF37LDE', 'CA278SHF48EVV'), 0.25), ", 280 | " (('CA278SHF48EVV', 'CA278SHF48EVV'), 0.25), ", 281 | " (('LA628SHF40IHZ', 'CA278SHF48EVV'), 0.25), ", 282 | " (('CA278SHF97UGW', 'CA278SHF48EVV'), 0.25), ", 283 | " (('CA278SHF45EVY', 'CA278SHF48EVV'), 0.25)], ", 284 | " [(('CA278SHF45EVY', 'LA628SHF40IHZ'), 0.5), ", 285 | " (('SA232SHF43XLS', 'LA628SHF40IHZ'), 0.25), ", 286 | " (('SA232SHF74ADT', 'LA628SHF40IHZ'), 0.25), ", 287 | " (('LA628SHF52JSD', 'LA628SHF40IHZ'), 0.25), ", 288 | " (('SA232SHF29PWG', 'LA628SHF40IHZ'), 0.25), ", 289 | " (('DO302SHF23LDS', 'LA628SHF40IHZ'), 0.25), ", 290 | " (('DO302SHF37LDE', 'LA628SHF40IHZ'), 0.25), ", 291 | " (('CA278SHF48EVV', 'LA628SHF40IHZ'), 0.25), ", 292 | " (('LA628SHF40IHZ', 'LA628SHF40IHZ'), 0.25), ", 293 | " (('CA278SHF97UGW', 'LA628SHF40IHZ'), 0.25), ", 294 | " (('CA278SHF45EVY', 'LA628SHF40IHZ'), 0.25)], ", 295 | " [(('CA278SHF45EVY', 'CA278SHF97UGW'), 0.5), ", 296 | " 
(('SA232SHF43XLS', 'CA278SHF97UGW'), 0.25), ", 297 | " (('SA232SHF74ADT', 'CA278SHF97UGW'), 0.25), ", 298 | " (('LA628SHF52JSD', 'CA278SHF97UGW'), 0.25), ", 299 | " (('SA232SHF29PWG', 'CA278SHF97UGW'), 0.25), ", 300 | " (('DO302SHF23LDS', 'CA278SHF97UGW'), 0.25), ", 301 | " (('DO302SHF37LDE', 'CA278SHF97UGW'), 0.25), ", 302 | " (('CA278SHF48EVV', 'CA278SHF97UGW'), 0.25), ", 303 | " (('LA628SHF40IHZ', 'CA278SHF97UGW'), 0.25), ", 304 | " (('CA278SHF97UGW', 'CA278SHF97UGW'), 0.25), ", 305 | " (('CA278SHF45EVY', 'CA278SHF97UGW'), 0.25)], ", 306 | " [(('CA278SHF45EVY', 'CA278SHF45EVY'), 0.5), ", 307 | " (('SA232SHF43XLS', 'CA278SHF45EVY'), 0.25), ", 308 | " (('SA232SHF74ADT', 'CA278SHF45EVY'), 0.25), ", 309 | " (('LA628SHF52JSD', 'CA278SHF45EVY'), 0.25), ", 310 | " (('SA232SHF29PWG', 'CA278SHF45EVY'), 0.25), ", 311 | " (('DO302SHF23LDS', 'CA278SHF45EVY'), 0.25), ", 312 | " (('DO302SHF37LDE', 'CA278SHF45EVY'), 0.25), ", 313 | " (('CA278SHF48EVV', 'CA278SHF45EVY'), 0.25), ", 314 | " (('LA628SHF40IHZ', 'CA278SHF45EVY'), 0.25), ", 315 | " (('CA278SHF97UGW', 'CA278SHF45EVY'), 0.25), ", 316 | " (('CA278SHF45EVY', 'CA278SHF45EVY'), 0.25)]], ", 317 | " [[(('AS296SCF58FCV', 'AS296SCF58FCV'), 0.25), ", 318 | " (('CA278SHF78LIB', 'AS296SCF58FCV'), 0.25), ", 319 | " (('CA278SHF77LIC', 'AS296SCF58FCV'), 0.25)], ", 320 | " [(('AS296SCF58FCV', 'CA278SHF78LIB'), 0.25), ", 321 | " (('CA278SHF78LIB', 'CA278SHF78LIB'), 0.25), ", 322 | " (('CA278SHF77LIC', 'CA278SHF78LIB'), 0.25)], ", 323 | " [(('AS296SCF58FCV', 'CA278SHF77LIC'), 0.25), ", 324 | " (('CA278SHF78LIB', 'CA278SHF77LIC'), 0.25), ", 325 | " (('CA278SHF77LIC', 'CA278SHF77LIC'), 0.25)]]]" 326 | ] 327 | }, 328 | execution_count: 36, 329 | metadata: { }, 330 | output_type: "execute_result" 331 | } 332 | ], 333 | source: [ 334 | "train_rdd.take(3):" 335 | ] 336 | }, 337 | { 338 | cell_type: "code", 339 | execution_count: 37, 340 | metadata: { 341 | collapsed: true 342 | }, 343 | outputs: [ ], 344 | source: [ 345 | "train_rdd = train_rdd.flatMap(lambda x: x).flatMap(lambda x: x)" 346 | ] 347 | }, 348 | { 349 | cell_type: "code", 350 | execution_count: 38, 351 | metadata: { }, 352 | outputs: [ 353 | { 354 | data: { 355 | text/plain: [ 356 | "[(('LU759APM92BCD', 'LU759APM92BCD'), 0.25), ", 357 | " (('MA099APM20LVF', 'LU759APM92BCD'), 0.25), ", 358 | " (('DU387APM43GAQ', 'LU759APM92BCD'), 0.25)]" 359 | ] 360 | }, 361 | execution_count: 38, 362 | metadata: { }, 363 | output_type: "execute_result" 364 | } 365 | ], 366 | source: [ 367 | "train_rdd.take(3)" 368 | ] 369 | }, 370 | { 371 | cell_type: "code", 372 | execution_count: 43, 373 | metadata: { }, 374 | outputs: [ 375 | { 376 | data: { 377 | text/plain: [ 378 | "110201577" 379 | ] 380 | }, 381 | execution_count: 43, 382 | metadata: { }, 383 | output_type: "execute_result" 384 | } 385 | ], 386 | source: [ 387 | "train_rdd.count()" 388 | ] 389 | }, 390 | { 391 | cell_type: "code", 392 | execution_count: 41, 393 | metadata: { 394 | collapsed: true 395 | }, 396 | outputs: [ ], 397 | source: [ 398 | "r = train_rdd.reduceByKey(operator.add)" 399 | ] 400 | }, 401 | { 402 | cell_type: "code", 403 | execution_count: 42, 404 | metadata: { }, 405 | outputs: [ 406 | { 407 | data: { 408 | text/plain: [ 409 | "[(('VI185ACF98VRP', 'SA232ACF26KXP'), 0.75), ", 410 | " (('HA651APF16CBF', 'CA558APF27RGU'), 0.25), ", 411 | " (('AG170APF90ZUN', 'MA250APF39AWQ'), 0.5)]" 412 | ] 413 | }, 414 | execution_count: 42, 415 | metadata: { }, 416 | output_type: "execute_result" 417 | } 418 | ], 419 | source: [ 420 | "r.take(3) # taking 
almost 30 mins now..." 421 | ] 422 | }, 423 | { 424 | cell_type: "code", 425 | execution_count: 61, 426 | metadata: { 427 | collapsed: true 428 | }, 429 | outputs: [ ], 430 | source: [ 431 | "r = sc.parallelize([[(i, i) for i in range(300000) ] ]).flatMap(lambda x: x) ", 432 | "r2 = sc.parallelize([[(i, i) for i in range(300000) ] ]).flatMap(lambda x: x)" 433 | ] 434 | }, 435 | { 436 | cell_type: "code", 437 | execution_count: 65, 438 | metadata: { 439 | collapsed: true 440 | }, 441 | outputs: [ ], 442 | source: [ 443 | "r3 = r.cartesian(r2)" 444 | ] 445 | } 446 | ], 447 | metadata: { 448 | kernelspec: { 449 | display_name: "PySpark", 450 | language: "python", 451 | name: "pyspark" 452 | }, 453 | language_info: { 454 | codemirror_mode: { 455 | name: "ipython", 456 | version: 3 457 | }, 458 | file_extension: ".py", 459 | mimetype: "text/x-python", 460 | name: "python", 461 | nbconvert_exporter: "python", 462 | pygments_lexer: "ipython3", 463 | version: "3.5.2" 464 | } 465 | }, 466 | nbformat: 4, 467 | nbformat_minor: 2 468 | } 469 | -------------------------------------------------------------------------------- /tests/system/spark_jobs/test_neighbor.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 
22 | 23 | """ 24 | This is system tests and should only be run if the environment has pyspark 25 | and a spark cluster installed to receive on-demand jobs 26 | """ 27 | 28 | import os 29 | import unittest 30 | import sys 31 | import mock 32 | import json 33 | import datetime 34 | import pyspark 35 | import math 36 | import glob 37 | import shutil 38 | from collections import namedtuple 39 | import numpy as np 40 | 41 | from pyspark.sql import types as stypes 42 | sys.path.append('./spark_jobs') 43 | 44 | py_files = ['./spark_jobs/neighbor.py', 45 | './spark_jobs/base.py', 46 | './spark_jobs/factory.py'] 47 | 48 | 49 | class Test_neighbor(unittest.TestCase): 50 | 51 | _sc = pyspark.SparkContext(pyFiles=py_files) 52 | _session = pyspark.sql.SparkSession(_sc) 53 | _to_delete_uris = [] 54 | 55 | 56 | @staticmethod 57 | def _get_target_class(): 58 | from neighbor import MarrecoNeighborJob 59 | 60 | 61 | return MarrecoNeighborJob 62 | 63 | 64 | @staticmethod 65 | def _delete_dirs(*args): 66 | for arg in args: 67 | if os.path.isdir(arg): 68 | shutil.rmtree(arg) 69 | 70 | 71 | def _prepare_daily_data(self): 72 | for i in [1, 2]: 73 | uri = 'tests/system/data/neighbor/train/{}/train.json'.format( 74 | i) 75 | data = self._sc.textFile(uri) 76 | formatted_day = (datetime.datetime.now() - 77 | datetime.timedelta(days=i)).strftime('%Y-%m-%d') 78 | 79 | save_uri = 'tests/system/data/neighbor/train/{}/train.json'.format( 80 | formatted_day) 81 | self._delete_dirs(save_uri) 82 | self._to_delete_uris.append(os.path.dirname(save_uri)) 83 | data.saveAsTextFile(save_uri) 84 | 85 | 86 | def _delete_uris(self): 87 | for uri in self._to_delete_uris: 88 | self._delete_dirs(uri) 89 | self._to_delete_uris = [] 90 | 91 | 92 | def test_process_datajet_day_no_force(self): 93 | klass = self._get_target_class()() 94 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay']) 95 | args = Args(0.5, 2.0, 0.1) 96 | inter_uri = 'tests/system/data/dj' 97 | self._delete_dirs(inter_uri) 98 | self.assertFalse(os.path.isdir(inter_uri)) 99 | 100 | test_days = (datetime.datetime.now() 101 | - datetime.datetime(*[2017, 8, 13])).days 102 | 103 | klass._process_datajet_day(self._sc, 104 | 'tests/system/data/datajet_test.json', 105 | inter_uri, 106 | args, 107 | mode=None, 108 | compression=None) 109 | 110 | expected = [str({"user_id": "25e35a54c8cace51", 111 | "interacted_items":[{"key":"MA042APM76IPJ", 112 | "score": float(str(args.w_browse * math.exp( 113 | -args.decay * test_days))[:9])}]}), 114 | str({"user_id": "610574c802ba3b33", 115 | "interacted_items":[{"key":"DA923SHF35RHK", 116 | "score": float(str(args.w_purchase * math.exp( 117 | -args.decay * test_days))[:9])}, 118 | {"key": "VI618SHF69UQC", 119 | "score": float(str(args.w_purchase * math.exp( 120 | -args.decay * test_days))[:9])}]})] 121 | 122 | result = [json.loads(i) for i in ''.join([open(e).read() 123 | for e in glob.glob(inter_uri + '/*.json')]).split('\n') if i] 124 | for e in result: 125 | for item in e['interacted_items']: 126 | item['score'] = float(str(item['score'])[:9]) 127 | 128 | self.assertEqual(expected, [str(e) for e in result]) 129 | 130 | 131 | def test_process_datajet_day_yes_force(self): 132 | klass = self._get_target_class()() 133 | spark = pyspark.sql.SparkSession(self._sc) 134 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay']) 135 | args = Args(0.5, 2.0, 0.1) 136 | inter_uri = '/tests/system/data/dj' 137 | self._delete_dirs(inter_uri) 138 | 139 | test_days = (datetime.datetime.now() 140 | - datetime.datetime(*[2017, 8, 
13])).days 141 | 142 | klass._process_datajet_day(self._sc, 143 | 'tests/system/data/datajet_test.json', 144 | inter_uri, 145 | args, 146 | mode=None, 147 | compression=None) 148 | 149 | self.assertTrue(os.path.isdir(inter_uri)) 150 | 151 | klass._process_datajet_day(self._sc, 152 | 'tests/system/data/datajet_test.json', 153 | inter_uri, 154 | args, 155 | mode='overwrite', 156 | compression=None) 157 | 158 | expected = [str({"user_id": "25e35a54c8cace51", 159 | "interacted_items":[{"key":"MA042APM76IPJ", 160 | "score": float(str(args.w_browse * math.exp( 161 | -args.decay * test_days))[:9])}]}), 162 | str({"user_id": "610574c802ba3b33", 163 | "interacted_items":[{"key":"DA923SHF35RHK", 164 | "score": float(str(args.w_purchase * math.exp( 165 | -args.decay * test_days))[:9])}, 166 | {"key": "VI618SHF69UQC", 167 | "score": float(str(args.w_purchase * math.exp( 168 | -args.decay * test_days))[:9])}]})] 169 | 170 | result = [json.loads(i) for i in ''.join([open(e).read() 171 | for e in glob.glob(inter_uri + '/*.json')]).split('\n') if i] 172 | for e in result: 173 | for item in e['interacted_items']: 174 | item['score'] = float(str(item['score'])[:9]) 175 | 176 | self.assertEqual(expected, [str(e) for e in result]) 177 | self._delete_dirs(inter_uri) 178 | self.assertFalse(os.path.isdir(inter_uri)) 179 | 180 | 181 | def test_transform_data_no_force(self): 182 | klass = self._get_target_class()() 183 | inter_uri = 'tests/system/data/neighbor/inter/{}' 184 | Args = namedtuple('args', ['days_init', 185 | 'days_end', 186 | 'w_browse', 187 | 'w_purchase', 188 | 'force', 189 | 'source_uri', 190 | 'inter_uri', 191 | 'neighbor_uri', 192 | 'threshold', 193 | 'decay']) 194 | 195 | self._prepare_daily_data() 196 | 197 | args = Args(2, 1, 0.5, 6, 'no', 198 | 'tests/system/data/neighbor/train/{}/train.json', 199 | inter_uri, 200 | 'tests/sytem/data/neighbor/result', 201 | 0.0, 0.0) 202 | klass.transform_data(self._sc, args) 203 | 204 | data1_uri = 'tests/system/data/neighbor/transformed_1.json' 205 | data2_uri = 'tests/system/data/neighbor/transformed_2.json' 206 | 207 | expected = {2: sorted(self._session.read.json(data2_uri, 208 | schema = klass._load_users_matrix_schema()) \ 209 | .toJSON().collect()), 210 | 1: sorted(self._session.read.json(data1_uri, 211 | schema = klass._load_users_matrix_schema()) \ 212 | .toJSON().collect())} 213 | 214 | for day in range(args.days_init, args.days_end - 1, -1): 215 | formatted_day = klass.get_formatted_date(day) 216 | result = sorted(self._session.read.json( 217 | inter_uri.format(formatted_day), 218 | schema=klass._load_users_matrix_schema())\ 219 | .toJSON().collect()) 220 | for i in range(len(result)): 221 | result_i = json.loads(result[i]) 222 | result_i['interacted_items'] = sorted( 223 | result_i['interacted_items'], key=lambda x: x['key']) 224 | expected_i = json.loads(expected[day][i]) 225 | expected_i['interacted_items'] = sorted( 226 | expected_i['interacted_items'], key=lambda x: x['key']) 227 | self.assertEqual(expected_i, result_i) 228 | 229 | for day in [2, 1]: 230 | formatted_day = klass.get_formatted_date(day) 231 | self._delete_dirs(inter_uri.format(formatted_day)) 232 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 233 | self._delete_uris() 234 | self.assertEqual(self._to_delete_uris, []) 235 | 236 | 237 | def test_transform_data_yes_force(self): 238 | klass = self._get_target_class()() 239 | inter_uri = 'tests/system/data/neighbor/inter/{}' 240 | Args = namedtuple('args', ['days_init', 241 | 'days_end', 242 | 'w_browse', 243 | 
'w_purchase', 244 | 'force', 245 | 'source_uri', 246 | 'inter_uri', 247 | 'neighbor_uri', 248 | 'threshold', 249 | 'decay']) 250 | 251 | self._prepare_daily_data() 252 | 253 | args = Args(2, 1, 0.5, 6, 'no', 254 | 'tests/system/data/neighbor/train/{}/train.json', 255 | inter_uri, 256 | 'tests/sytem/data/neighbor/result', 257 | 0.0, 0.0) 258 | klass.transform_data(self._sc, args) 259 | 260 | args = Args(2, 1, 0.5, 6, 'yes', 261 | 'tests/system/data/neighbor/train/{}/train.json', 262 | inter_uri, 263 | 'tests/sytem/data/neighbor/result', 264 | 0.0, 0.0) 265 | klass.transform_data(self._sc, args) 266 | 267 | data1_uri = 'tests/system/data/neighbor/transformed_1.json' 268 | data2_uri = 'tests/system/data/neighbor/transformed_2.json' 269 | 270 | expected = {2: sorted(self._session.read.json(data2_uri, 271 | schema = klass._load_users_matrix_schema()) \ 272 | .toJSON().collect()), 273 | 1: sorted(self._session.read.json(data1_uri, 274 | schema = klass._load_users_matrix_schema()) \ 275 | .toJSON().collect())} 276 | 277 | for day in range(args.days_init, args.days_end - 1, -1): 278 | formatted_day = klass.get_formatted_date(day) 279 | result = sorted(self._session.read.json( 280 | inter_uri.format(formatted_day), 281 | schema=klass._load_users_matrix_schema())\ 282 | .toJSON().collect()) 283 | for i in range(len(result)): 284 | result_i = json.loads(result[i]) 285 | result_i['interacted_items'] = sorted( 286 | result_i['interacted_items'], key=lambda x: x['key']) 287 | expected_i = json.loads(expected[day][i]) 288 | expected_i['interacted_items'] = sorted( 289 | expected_i['interacted_items'], key=lambda x: x['key']) 290 | self.assertEqual(expected_i, result_i) 291 | 292 | for day in [2, 1]: 293 | formatted_day = klass.get_formatted_date(day) 294 | self._delete_dirs(inter_uri.format(formatted_day)) 295 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 296 | self._delete_uris() 297 | self.assertEqual(self._to_delete_uris, []) 298 | 299 | 300 | def test_build_marreco(self): 301 | klass = self._get_target_class()() 302 | result_uri = 'tests/system/data/neighbor/result/similarity' 303 | inter_uri = 'tests/system/data/neighbor/inter/{}' 304 | users_matrix_uri = 'tests/system/data/neighbor/result/users' 305 | 306 | self._delete_dirs(result_uri, users_matrix_uri) 307 | self.assertFalse(os.path.isdir(result_uri)) 308 | self.assertFalse(os.path.isdir(users_matrix_uri)) 309 | 310 | self._prepare_daily_data() 311 | 312 | Args = namedtuple('args', ['days_init', 313 | 'days_end', 314 | 'w_browse', 315 | 'w_purchase', 316 | 'force', 317 | 'source_uri', 318 | 'inter_uri', 319 | 'neighbor_uri', 320 | 'threshold', 321 | 'decay']) 322 | 323 | args = Args(2, 1, 0.5, 6, 'no', 324 | 'tests/system/data/neighbor/train/{}/train.json', 325 | inter_uri, 326 | 'tests/sytem/data/neighbor/result', 327 | 0.0, 0.0) 328 | 329 | klass.transform_data(self._sc, args) 330 | 331 | Args = namedtuple('args', ['days_init', 332 | 'days_end', 333 | 'inter_uri', 334 | 'neighbor_uri', 335 | 'threshold', 336 | 'users_matrix_uri']) 337 | 338 | args = Args(2, 1, inter_uri, result_uri, 0.0, users_matrix_uri) 339 | 340 | klass.build_marreco(self._sc, args) 341 | result = self._session.read.json(result_uri).collect() 342 | 343 | a = np.array([[0.5, 1., 0.5, 2.], 344 | [1., 2., 1., 0.5], 345 | [6., 1., 0.5, 0.5], 346 | [1., 1., 6., 6.]]) 347 | n = np.linalg.norm(a, axis=0).reshape(1, a.shape[1]) 348 | 349 | expected = a.T.dot(a) / n.T.dot(n) 350 | 351 | for row in result: 352 | key1 = row.item_key 353 | for inner_row in 
row.similarity_items: 354 | np.testing.assert_almost_equal( 355 | expected[int(key1), int(inner_row.key)], 356 | inner_row.score, decimal=6) 357 | 358 | for day in [2, 1]: 359 | formatted_day = klass.get_formatted_date(day) 360 | self._delete_dirs(inter_uri.format(formatted_day)) 361 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 362 | self._delete_dirs(result_uri) 363 | self.assertFalse(os.path.isdir(result_uri)) 364 | 365 | 366 | def test_build_marreco_with_threshold(self): 367 | klass = self._get_target_class()() 368 | result_uri = 'tests/system/data/neighbor/result/similarity' 369 | inter_uri = 'tests/system/data/neighbor/inter/{}' 370 | users_matrix_uri = 'tests/system/data/neighbor/result/users' 371 | 372 | self._prepare_daily_data() 373 | 374 | self._delete_dirs(result_uri, users_matrix_uri) 375 | self.assertFalse(os.path.isdir(result_uri)) 376 | self.assertFalse(os.path.isdir(users_matrix_uri)) 377 | 378 | Args = namedtuple('args', ['days_init', 379 | 'days_end', 380 | 'w_browse', 381 | 'w_purchase', 382 | 'force', 383 | 'source_uri', 384 | 'inter_uri', 385 | 'neighbor_uri', 386 | 'threshold', 387 | 'decay']) 388 | 389 | args = Args(2, 1, 0.5, 6, 'no', 390 | 'tests/system/data/neighbor/train/{}/train.json', 391 | inter_uri, 392 | 'tests/sytem/data/neighbor/result', 393 | 0.0, 0.0) 394 | 395 | klass.transform_data(self._sc, args) 396 | 397 | Args = namedtuple('args', ['days_init', 398 | 'days_end', 399 | 'inter_uri', 400 | 'neighbor_uri', 401 | 'threshold', 402 | 'users_matrix_uri']) 403 | 404 | args = Args(2, 1, inter_uri, result_uri, 0.11, users_matrix_uri) 405 | 406 | klass.build_marreco(self._sc, args) 407 | result = self._session.read.json(result_uri).collect() 408 | 409 | a = np.array([[0.5, 1., 0.5, 2.], 410 | [1., 2., 1., 0.5], 411 | [6., 1., 0.5, 0.5], 412 | [1., 1., 6., 6.]]) 413 | 414 | n = np.linalg.norm(a, axis=0).reshape(1, a.shape[1]) 415 | 416 | expected = a.T.dot(a) / n.T.dot(n) 417 | 418 | Args = namedtuple('args', ['days_init', 419 | 'days_end', 420 | 'inter_uri', 421 | 'neighbor_uri', 422 | 'threshold', 423 | 'users_matrix_uri']) 424 | 425 | args = Args(2, 1, inter_uri, result_uri, 0.11, users_matrix_uri) 426 | 427 | klass.build_marreco(self._sc, args) 428 | result = self._session.read.json(result_uri).collect() 429 | 430 | for row in result: 431 | key1 = row.item_key 432 | for inner_row in row.similarity_items: 433 | actual = expected[int(key1), int(inner_row.key)] 434 | print('expected: ', actual) 435 | print('estimate: ', inner_row.score) 436 | self.assertTrue((actual - 437 | inner_row.score) / actual < 0.2) 438 | 439 | 440 | for day in [2, 1]: 441 | formatted_day = klass.get_formatted_date(day) 442 | self._delete_dirs(inter_uri.format(formatted_day)) 443 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day))) 444 | 445 | 446 | self._delete_dirs(result_uri, users_matrix_uri) 447 | self.assertFalse(os.path.isdir(result_uri)) 448 | self.assertFalse(os.path.isdir(users_matrix_uri)) 449 | -------------------------------------------------------------------------------- /notebooks/marreco_dimsum_sparse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark.mllib.linalg import SparseVector\n", 12 | "from pyspark.mllib.linalg.distributed import RowMatrix\n", 13 | "import numpy as np\n", 14 | "from sklearn.metrics.pairwise import 
cosine_similarity\n", 15 | "import time\n", 16 | "from collections import defaultdict\n", 17 | "from pyspark.sql import functions as sfunc\n", 18 | "from pyspark.sql import types as stypes\n", 19 | "import math\n", 20 | "import sys\n", 21 | "from pyspark.ml.linalg import SparseVector\n", 22 | "from pyspark.mllib.linalg.distributed import RowMatrix\n", 23 | "from operator import itemgetter\n", 24 | "import operator\n", 25 | "import random" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "schema = stypes.StructType().add(\"fv\", stypes.StringType()).add(\"sku\", stypes.StringType()).add(\"score\", stypes.FloatType())\n", 37 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema)\n", 38 | "train_df.createOrReplaceTempView('test1')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 17, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "[Row(fv='6094795238635852694', sku='BR049APM25PCS', score=0.5),\n", 50 | " Row(fv='7454424246364596889', sku='TR763APF11DLC', score=0.5),\n", 51 | " Row(fv='5798933384203870548', sku='AN778SHF35NNG', score=0.5)]" 52 | ] 53 | }, 54 | "execution_count": 17, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "train_df.head(3)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 188, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "[Row(fv='1005105267406228429', sku='FI911SHF89UBM-50', score=5.0)]\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "print(train_df.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 82, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "# query = \"\"\"\n", 89 | "# SELECT\n", 90 | "# sku,\n", 91 | "# ROW_NUMBER() OVER (ORDER BY SUM(1)) -1 idx\n", 92 | "# FROM test1\n", 93 | "# GROUP BY 1\n", 94 | "# \"\"\"\n", 95 | "# skus_rdd = spark.sql(query).rdd" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "query_statistics = \"\"\"\n", 107 | "SELECT\n", 108 | " sku,\n", 109 | " SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) / SQRT(SUM(score * score)) p,\n", 110 | " IF(SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) > SQRT(SUM(score * score)), SQRT(SUM(score * score)), SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold})) q --- implements the min(gamma, ||c||)\n", 111 | "FROM test1\n", 112 | "GROUP BY 1\n", 113 | "\"\"\"" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 8, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "skus_stats = spark.sql(query_statistics.format(threshold=0.1))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 9, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "[Row(sku='FI911SHF89UBM-50', p=7.132311576894841, q=5.0)]\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 178, 147 | "metadata": {}, 148 | 
"outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "[Row(sku='PO140ACU06DDD', p=2.4697175158107982, q=14.439529078193651),\n", 153 | " Row(sku='PO140ACU76FVN', p=35.661557884474206, q=1.0),\n", 154 | " Row(sku='JU082SHF02WWZ', p=3.790780833876121, q=9.40744386111339)]" 155 | ] 156 | }, 157 | "execution_count": 178, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "sku_stats.take(3)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 194, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "[]\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "# query_statistics = \"\"\"\n", 192 | "# SELECT\n", 193 | "# sku,\n", 194 | "# {gamma} / SQRT(SUM(score * score)) p,\n", 195 | "# IF({gamma} > SQRT(SUM(score * score)), SQRT(SUM(score * score)), {gamma}) q\n", 196 | "# FROM test1\n", 197 | "# GROUP BY 1\n", 198 | "# \"\"\"" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 60, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "# def get_gamma(threshold, numCols):\n", 210 | "# return math.sqrt(10 * math.log(numCols) / threshold) if threshold > 10e-6 else math.inf" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 76, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "35.57234899487128\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "# gamma_b = sc.broadcast(get_gamma(10e-2))\n", 228 | "# print(gamma_b.value)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 77, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "# skus_stats = spark.sql(query_statistics.format(gamma=gamma_b.value))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 78, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "[Row(sku='NI531SRM74IHX', p=2.8758539658272255, q=12.36931687685298),\n", 251 | " Row(sku='MO578SHF45QNE', p=0.5225157525775272, q=35.57234899487128)]" 252 | ] 253 | }, 254 | "execution_count": 78, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "# skus_stats.head(2)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "pq_b = sc.broadcast({row.sku: [row.p, row.q] for row in skus_stats.collect()})" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 11, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "[7.132311576894841, 5.0]" 283 | ] 284 | }, 285 | "execution_count": 11, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "pq_b.value['FI911SHF89UBM-50']" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 157, 297 | "metadata": { 298 | "collapsed": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "#skus_idx_b = sc.broadcast({sku: idx for idx, sku in 
enumerate(pq_b.value.keys())})" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 158, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "#idx_skus_b = sc.broadcast({value: key for key, value in skus_idx_b.value.items()})" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 53, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "# d = {row.sku: row.idx for row in skus_rdd.collect()}\n", 325 | "# db = sc.broadcast(d)\n", 326 | "\n", 327 | "# id_ = {value: key for key, value in d.items()}\n", 328 | "# id_b = sc.broadcast(id_)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 159, 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "#numCols = sc.broadcast(len(idx_skus_b.value))" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 57, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "# p = [0] * numCols.value\n", 351 | "# for row in skus_stats" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 55, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "#p = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()} # if 0 happens as the ``norm`` we expected an Exception to be raised.\n", 363 | "#p_b = sc.broadcast(p)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 34, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "#q = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()}" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 35, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/plain": [ 385 | "312988" 386 | ] 387 | }, 388 | "execution_count": 35, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "#numCols.value" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 31, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "12.36931687685298" 406 | ] 407 | }, 408 | "execution_count": 31, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "#skus_s['NI531SRM74IHX']" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 12, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "query_users_items = \"\"\"\n", 426 | "SELECT\n", 427 | "data\n", 428 | "FROM(\n", 429 | " SELECT\n", 430 | " fv,\n", 431 | " COLLECT_LIST(STRUCT(sku, score)) data\n", 432 | " FROM test1\n", 433 | " GROUP BY 1\n", 434 | ")\n", 435 | "WHERE SIZE(data) BETWEEN 2 AND 200\n", 436 | "\"\"\"\n", 437 | "\n", 438 | "t0 = time.time()\n", 439 | "users = spark.sql(query_users_items)\n", 440 | "users_rdd = users.rdd" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 148, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/plain": [ 451 | "[Row(data=[Row(sku='CO796SCF87LXG', score=0.5), Row(sku='CO796SCM72JGT', score=0.5), Row(sku='CO796SCM23HHW', score=0.5)]),\n", 452 | " Row(data=[Row(sku='HA723APF18CPL', score=0.5), Row(sku='CO515APF44YPR', score=0.5), Row(sku='LA906APF69OQC', score=0.5), Row(sku='TU142APF19BPC', score=0.5), 
Row(sku='CO515APF27DIA', score=0.5), Row(sku='GA753APF40NJR', score=0.5), Row(sku='GA753APF41NJQ', score=1.0)])]" 453 | ] 454 | }, 455 | "execution_count": 148, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "users.head(2)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 13, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "def map_cosines(row):\n", 473 | " for i in range(len(row)):\n", 474 | " value_i = row[i].score / pq_b.value[row[i].sku][1]\n", 475 | " if random.random() < pq_b.value[row[i].sku][0]:\n", 476 | " for j in range(i + 1, len(row)):\n", 477 | " value_j = row[j].score / pq_b.value[row[j].sku][1]\n", 478 | " if random.random() < pq_b.value[row[i].sku][0]:\n", 479 | " yield ((row[i].sku, row[j].sku), value_i * value_j)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 14, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "users2 = users.rdd.flatMap(lambda row: map_cosines(row.data))" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 150, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "[(('CO796SCM72JGT', 'CO796SCM23HHW'), 0.0002015811797719921),\n", 502 | " (('HA723APF18CPL', 'CO515APF44YPR'), 0.031234752377721216)]" 503 | ] 504 | }, 505 | "execution_count": 150, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "users2.take(2)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 15, 517 | "metadata": { 518 | "collapsed": true 519 | }, 520 | "outputs": [], 521 | "source": [ 522 | "final = users2.reduceByKey(operator.add)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 16, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stdout", 532 | "output_type": "stream", 533 | "text": [ 534 | "[(('VI618SHF35NCY-51', 'LU773ACF56ILV'), 0.029501220638256383), (('FI911APF72ZHF', 'KA952APF52DNB'), 0.015504341823651058), (('FA865ACF45CCS', 'QU097ACF14BCMN'), 0.7071067811865475)]\n", 535 | "363.733115196228\n" 536 | ] 537 | } 538 | ], 539 | "source": [ 540 | "t0 = time.time()\n", 541 | "print(final.take(3))\n", 542 | "print(time.time() - t0)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": { 549 | "collapsed": true 550 | }, 551 | "outputs": [], 552 | "source": [] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 18, 557 | "metadata": { 558 | "collapsed": true 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "import numpy as np" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 20, 568 | "metadata": { 569 | "collapsed": true 570 | }, 571 | "outputs": [], 572 | "source": [ 573 | "a = np.random.randn(12288, 150) # a.shape = (12288, 150)\n", 574 | "b = np.random.randn(150, 45) # b.shape = (150, 45)\n", 575 | "c = np.dot(a,b)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 21, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "data": { 585 | "text/plain": [ 586 | "(12288, 45)" 587 | ] 588 | }, 589 | "execution_count": 21, 590 | "metadata": {}, 591 | "output_type": "execute_result" 592 | } 593 | ], 594 | "source": [ 595 | "c.shape" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 39, 601 | "metadata": { 602 | "collapsed": true 603 | }, 604 | 
"outputs": [], 605 | "source": [ 606 | "b = np.random.randn(4, 1)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 40, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": [ 617 | "array([[ 0.22988676],\n", 618 | " [-0.77589895],\n", 619 | " [-0.77754825],\n", 620 | " [-0.06151452]])" 621 | ] 622 | }, 623 | "execution_count": 40, 624 | "metadata": {}, 625 | "output_type": "execute_result" 626 | } 627 | ], 628 | "source": [ 629 | "b" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 41, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "data": { 639 | "text/plain": [ 640 | "array([-0.06151452])" 641 | ] 642 | }, 643 | "execution_count": 41, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "b[3]" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 42, 655 | "metadata": { 656 | "collapsed": true 657 | }, 658 | "outputs": [], 659 | "source": [ 660 | "a = np.random.randn(3, 3)\n", 661 | "b = np.random.randn(3, 1)\n", 662 | "c = a*b" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 44, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/plain": [ 673 | "array([[-0.01004274, -0.45400667, -1.97007744],\n", 674 | " [-0.54591752, -0.59968557, 1.47375852],\n", 675 | " [ 0.33738485, 1.00607007, 0.69213239]])" 676 | ] 677 | }, 678 | "execution_count": 44, 679 | "metadata": {}, 680 | "output_type": "execute_result" 681 | } 682 | ], 683 | "source": [ 684 | "a" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 45, 690 | "metadata": {}, 691 | "outputs": [ 692 | { 693 | "data": { 694 | "text/plain": [ 695 | "array([[ 0.42442128],\n", 696 | " [-0.8827092 ],\n", 697 | " [-0.5387125 ]])" 698 | ] 699 | }, 700 | "execution_count": 45, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "b" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 46, 712 | "metadata": {}, 713 | "outputs": [ 714 | { 715 | "data": { 716 | "text/plain": [ 717 | "array([[-0.00426235, -0.19269009, -0.83614278],\n", 718 | " [ 0.48188642, 0.52934797, -1.30090021],\n", 719 | " [-0.18175343, -0.54198252, -0.37286037]])" 720 | ] 721 | }, 722 | "execution_count": 46, 723 | "metadata": {}, 724 | "output_type": "execute_result" 725 | } 726 | ], 727 | "source": [ 728 | "c" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "collapsed": true 736 | }, 737 | "outputs": [], 738 | "source": [] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "metadata": { 744 | "collapsed": true 745 | }, 746 | "outputs": [], 747 | "source": [] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": { 753 | "collapsed": true 754 | }, 755 | "outputs": [], 756 | "source": [] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": { 762 | "collapsed": true 763 | }, 764 | "outputs": [], 765 | "source": [] 766 | } 767 | ], 768 | "metadata": { 769 | "kernelspec": { 770 | "display_name": "PySpark", 771 | "language": "python", 772 | "name": "pyspark" 773 | }, 774 | "language_info": { 775 | "codemirror_mode": { 776 | "name": "ipython", 777 | "version": 3 778 | }, 779 | "file_extension": ".py", 780 | "mimetype": "text/x-python", 781 | "name": "python", 782 | "nbconvert_exporter": "python", 783 | 
"pygments_lexer": "ipython3", 784 | "version": "3.5.2" 785 | } 786 | }, 787 | "nbformat": 4, 788 | "nbformat_minor": 2 789 | } 790 | -------------------------------------------------------------------------------- /spark_jobs/neighbor.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2017 Willian Fuks 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | """ 24 | Set of tools to run Marreco's neighborhood algorithm in spark. 25 | """ 26 | 27 | import os 28 | import sys 29 | import json 30 | import operator 31 | import math 32 | import random 33 | import time 34 | import argparse 35 | import datetime 36 | from collections import defaultdict 37 | 38 | sys.path.append('..') 39 | 40 | from base import MarrecoBase 41 | from py4j.protocol import Py4JJavaError 42 | from pyspark.sql import SparkSession 43 | from pyspark.sql import types as stypes 44 | from pyspark.sql.utils import AnalysisException 45 | 46 | 47 | class MarrecoNeighborJob(MarrecoBase): 48 | """This Class has all methods necessary to build Marreco Neighborhood 49 | against Spark. 50 | 51 | :type context: `pyspark.SparkContext` 52 | :param context: context to run Spark jobs. 53 | """ 54 | def transform_data(self, sc, args): 55 | """This method gets datajet files as input and prepare them on a daily 56 | intermediary basis for Marreco's main algorithm DIMSUM. 57 | 58 | :type sc: spark context 59 | :param sc: spark context for running jobs. 60 | 61 | :param args: 62 | 63 | :type days_init: int 64 | :param days: How many days to scan through the files to be used 65 | in the transformation phase. If this value is say 66 | ``5`` then Marreco will take today's date and come 67 | back 5 days in time from where it will start reading 68 | input files. 69 | 70 | :type days_end: int 71 | :param days_end: Similar to ``days_init`` but tells where the end 72 | of scanning should be. If set say equals to ``3``, 73 | then scans back in time until 3 days ago couting 74 | from today. 75 | 76 | :type w_browse: float 77 | :param w_browse: Weight associated to browsing events on skus. 78 | 79 | :type w_purchase: float 80 | :param w_purchase: Weight associated to purchasing events on skus. 81 | 82 | :type force: str 83 | :param force: Either ``yes``, in which case forces recreation of 84 | files, or ``no``, in which case if files already 85 | exist then do nothing. 
86 | 87 | :type source_uri: str 88 | :param source_uri: URI from where to read input data from. 89 | 90 | :type inter_uri: str 91 | :param inter_uri: URI to save intermediate results. 92 | 93 | :type neighbor_uri: str 94 | :param neighbor_uri: URI for where to save similarity matrix result. 95 | 96 | :type threshold: float 97 | :param threshold: This should be converted to float. It asserts how 98 | much quality we should sacrifice in order to gain 99 | performance. 100 | 101 | :type decay: float 102 | :param decay: How much less of an influence a score has given how 103 | long ago it happened. The further ago the more this 104 | ``decay`` factor diminishes the value. 105 | """ 106 | spark = SparkSession(sc) 107 | for day in range(args.days_init, args.days_end - 1, -1): 108 | formatted_day = self.get_formatted_date(day) 109 | source_uri = args.source_uri.format(formatted_day) 110 | inter_uri = args.inter_uri.format(formatted_day) 111 | try: 112 | inter_data = spark.read.json(inter_uri, 113 | schema = self._load_users_matrix_schema()).first() 114 | 115 | if args.force == 'yes' or not inter_data: 116 | self._process_datajet_day(sc, 117 | source_uri, 118 | inter_uri, 119 | args, 120 | mode='overwrite') 121 | except (Py4JJavaError, AnalysisException): 122 | self._process_datajet_day(sc, source_uri, inter_uri, args) 123 | 124 | 125 | def _process_datajet_day(self, 126 | sc, 127 | uri, 128 | inter_uri, 129 | args, 130 | mode=None, 131 | compression='gzip'): 132 | """Gets datajet json like files and transforms them into data like 133 | [user_id [(sku, score),...]] saving it in the end. 134 | 135 | :type sc: spark context 136 | :param sc: context to run spark jobs. 137 | 138 | :type uri: str 139 | :param uri: where the files are located. 140 | 141 | :type inter_uri: str 142 | :param inter_uri: where intermediate results should be saved. 143 | 144 | :type args: namedtuple 145 | :type args.w_browse: float 146 | :param args.w_browse: weight associated to users browsing history. 147 | 148 | :type args.w_purchase: float 149 | :param args.w_purchase: weight associated to purchases. 150 | 151 | :type args.decay: float 152 | :param args.decay: decay factor for account events that happened 153 | long ago. 154 | 155 | :type mode: str 156 | :param mode: indicates how data should be saved. If ``None`` then 157 | throws error if file already exist. If ``overwrite`` then 158 | deletes previous file and saves new one. 159 | """ 160 | sc.textFile(uri) \ 161 | .flatMap(lambda x: self._process_json(x, args)) \ 162 | .filter(lambda x: x) \ 163 | .groupByKey() \ 164 | .mapValues(list) \ 165 | .flatMap(lambda x: self._aggregate_skus(x)) \ 166 | .toDF(schema=self._load_users_matrix_schema()) \ 167 | .write.json(inter_uri, compression=compression, mode=mode) 168 | 169 | 170 | def _load_users_matrix_schema(self): 171 | """Loads schema with data type [user, [(sku, score), (sku, score)]] 172 | 173 | :rtype: `pyspark.sql.type.StructType` 174 | :returns: schema speficiation for user -> (sku, score) data. 175 | """ 176 | return stypes.StructType(fields=[ 177 | stypes.StructField("user_id", stypes.StringType()), 178 | stypes.StructField('interacted_items', stypes.ArrayType( 179 | stypes.StructType(fields=[stypes.StructField('key', 180 | stypes.StringType()), stypes.StructField('score', 181 | stypes.FloatType())])))]) 182 | 183 | 184 | def build_marreco(self, sc, args): 185 | """Main method for building Marreco's algorithms and saving results 186 | for later usage. 
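It unions the daily intermediary files written by ``transform_data``, sums scores per (user, sku) pair and then runs DIMSUM over the aggregated rows. As a purely illustrative example of the input it expects (identifiers and scores are made up; the shape follows ``_load_users_matrix_schema``), each intermediary JSON line looks like {"user_id": "u0", "interacted_items": [{"key": "sku0", "score": 0.5}, {"key": "sku1", "score": 1.0}]}.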
187 | 188 | :type sc: `pyspark.SparkContext` 189 | :param sc: spark context for running jobs. 190 | 191 | :param args: 192 | :type days_init: int 193 | :param days_init: how many days back from today to start reading the 194 | intermediary daily results. 195 | 196 | :type days_end: int 197 | :param days_end: how many days back from today to stop reading them. 198 | 199 | :type inter_uri: str 200 | :param inter_uri: URI template from where the intermediary daily 201 | results are read (it gets formatted with each day's 202 | date before reading). 203 | 204 | :type neighbor_uri: str 205 | :param neighbor_uri: where to save the final marreco similarity 206 | matrix (and, optionally, the user_sku_score 207 | matrix). 208 | 209 | :type users_matrix_uri: str 210 | :param users_matrix_uri: URI for where to save matrix of users 211 | and their interacted skus. 212 | 213 | :type threshold: float 214 | :param threshold: sets how much quality we are willing to sacrifice 215 | in exchange for shorter processing time (higher 216 | values mean more aggressive sampling). 217 | """ 218 | spark = SparkSession(sc) 219 | data = sc.emptyRDD() 220 | for day in range(args.days_init, args.days_end - 1, -1): 221 | formatted_day = self.get_formatted_date(day) 222 | inter_uri = self._render_inter_uri( 223 | args.inter_uri.format(formatted_day)) 224 | 225 | data = data.union(spark.read.json(inter_uri, 226 | schema=self._load_users_matrix_schema()).rdd) 227 | 228 | data = data.reduceByKey(operator.add) \ 229 | .flatMap(lambda x: self._aggregate_skus(x)) \ 230 | .filter(lambda x: len(x[1]) > 1 and len(x[1]) <= 20) 231 | 232 | if args.users_matrix_uri: 233 | self._save_users_matrix(args.users_matrix_uri, data) 234 | 235 | pq_b = self._broadcast_pq(sc, data, args.threshold) 236 | data = data.flatMap(lambda x: self._run_DIMSUM(x[1], pq_b)) \ 237 | .reduceByKey(operator.add) 238 | 239 | self._save_neighbor_matrix(args.neighbor_uri, data) 240 | 241 | 242 | def _save_neighbor_matrix(self, neighbor_uri, data): 243 | """Turns similarities into the final neighborhood matrix. The schema 244 | for saving the matrix is like {sku0: [(sku1, similarity1)...]} 245 | 246 | :type neighbor_uri: str 247 | :param neighbor_uri: uri for where to save the matrix. 248 | 249 | :type data: RDD 250 | :param data: RDD with data like [((sku0, sku1), similarity)] 251 | """ 252 | def duplicate_keys(row): 253 | """Emits each similarity under both orderings of its sku pair. 254 | The DIMSUM algorithm only computes the upper triangle of the 255 | similarity matrix, so here we also add the transposed entries 256 | so Marreco can see all similarities between all skus.
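For instance (hypothetical skus), an input row (('sku0', 'sku1'), 0.8) yields both ('sku0', [('sku1', 0.8)]) and ('sku1', [('sku0', 0.8)]), so that after the following ``reduceByKey`` every sku carries the complete list of its similar items.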
257 | 258 | :type row: list 259 | :param row: data of type [(sku0, sku1), similarity] 260 | 261 | :rtype: list: 262 | :returns: skus and their transposed similarities, such as 263 | [sku0, [sku1, s]], [sku1, [sku0, s]] 264 | """ 265 | yield (row[0][0], [(row[0][1], row[1])]) 266 | yield (row[0][1], [(row[0][0], row[1])]) 267 | 268 | data.flatMap(lambda x: duplicate_keys(x)) \ 269 | .reduceByKey(operator.add) \ 270 | .toDF(schema=self._load_neighbor_schema()) \ 271 | .write.json(neighbor_uri, compression='gzip', mode='overwrite') 272 | 273 | 274 | def _load_neighbor_schema(self): 275 | """Loads neighborhood schema for similarity matrix 276 | 277 | :rtype: `pyspark.sql.types.StructField` 278 | :returns: schema of type ["key", [("key", "value")]] 279 | """ 280 | return stypes.StructType(fields=[ 281 | stypes.StructField("item_key", stypes.StringType()), 282 | stypes.StructField("similarity_items", stypes.ArrayType( 283 | stypes.StructType(fields=[ 284 | stypes.StructField("key", stypes.StringType()), 285 | stypes.StructField("score", stypes.FloatType())])))]) 286 | 287 | 288 | def _save_users_matrix(self, user_matrix_uri, data): 289 | """Saves user -> sku matrix so Marreco can use it later for greater 290 | optimization. In this case, the matrix is saved as: 291 | [user_id, [{"key": sku, "score": score}] interacted_items] 292 | 293 | :type sc: `pyspark.SparkContext` 294 | :param sc: context for spark jobs. 295 | 296 | :type session: `pyspark.sql.SparkSession` 297 | :param session: session used so to be able to save DataFrames. 298 | 299 | :type data: RDD 300 | :param data: RDD with values [user, [(sku, score), (sku, score)]] 301 | """ 302 | def transform_users_data(row): 303 | """Transform row from [user, [(sku, score)]] to desired output. 304 | 305 | :type data: RDD 306 | :param data: observed users interaction 307 | """ 308 | yield [{"user_id": row[0], 309 | "interacted_items": list(map( 310 | lambda x: {"key": x[0], "score": x[1]}, row[1]))}] 311 | data.toDF(schema=self._load_users_matrix_schema()) \ 312 | .write.json(user_matrix_uri, compression='gzip', mode='overwrite') 313 | 314 | 315 | def _run_DIMSUM(self, row, pq_b): 316 | """Implements DIMSUM as describe here: 317 | 318 | http://arxiv.org/abs/1304.1467 319 | 320 | :type row: list 321 | :param row: list with values (user, [(sku, score)...]) 322 | 323 | :rtype: list 324 | :returns: similarities between skus in the form [(sku0, sku1, similarity)] 325 | """ 326 | for i in range(len(row)): 327 | if random.random() < pq_b.value[row[i][0]][0]: 328 | for j in range(i + 1, len(row)): 329 | if random.random() < pq_b.value[row[j][0]][0]: 330 | value_i = row[i][1] / pq_b.value[row[i][0]][1] 331 | value_j = row[j][1] / pq_b.value[row[j][0]][1] 332 | key = ((row[i][0], row[j][0]) if row[i][0] < row[j][0] 333 | else (row[j][0], row[i][0])) 334 | yield (key, value_i * value_j) 335 | 336 | 337 | def _broadcast_pq(self, sc, data, threshold): 338 | """Builds and broadcast probability ``p`` value and factor ``q`` for 339 | each sku. 340 | 341 | :type data: `spark.RDD` 342 | :param data: RDD with values (user, (sku, score)). 343 | 344 | :type threshold: float 345 | :param threshold: all similarities above this value will be guaranteed 346 | to converge to real value with relative error ``e``. 347 | 348 | :rtype: broadcasted dict 349 | :returns: dict sku -> (p, q) where p is defined as ``gamma / ||c||`` 350 | and ``q = min(gamma, ||c||)``. 
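As a sanity check with made-up numbers: for ``len(norms) == 1000`` columns and ``threshold = 0.1``, ``gamma = sqrt(10 * ln(1000) / 0.1) ~= 26.3`` (note that ``math.log`` is the natural logarithm); a sku whose column norm is ``||c|| = 5`` then gets ``p ~= 26.3 / 5 ~= 5.3`` (a sampling probability that is effectively 1 when compared against ``random.random()``) and ``q = min(26.3, 5) = 5``, while a very popular sku with ``||c|| = 100`` gets ``p ~= 0.26`` and ``q ~= 26.3``.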
351 | """ 352 | norms = {sku: score for sku, score in 353 | data.flatMap(lambda x: self._process_scores(x)) \ 354 | .reduceByKey(operator.add) \ 355 | .map(lambda x: (x[0], math.sqrt(x[1]))) \ 356 | .collect()} 357 | 358 | gamma = (math.sqrt(10 * math.log(len(norms)) / threshold) if threshold 359 | > 1e-6 else math.inf) 360 | 361 | pq_b = sc.broadcast({sku: (gamma / value, min(gamma, value)) 362 | for sku, value in norms.items()}) 363 | return pq_b 364 | 365 | 366 | def _process_scores(self, row): 367 | """After all user -> score aggregation is done, this method loops 368 | through each sku for a given user and yields its squared score so 369 | that we can compute the norm ``||c||`` for each sku column. 370 | 371 | :type row: list 372 | :param row: row of type (user, [(sku, score), ...]) 373 | 374 | :rtype: tuple 375 | :returns: tuple of type (sku, score ** 2) 376 | """ 377 | for inner_row in row[1]: 378 | yield (inner_row[0], inner_row[1] ** 2) 379 | 380 | 381 | def _render_inter_uri(self, inter_uri, name_pattern='part-*'): 382 | """Helper that appends spark's part-file pattern to ``inter_uri``. 383 | 384 | :type inter_uri: str 385 | :param inter_uri: URI used for saving intermediate data transformation 386 | results. 387 | 388 | :type name_pattern: str 389 | :param name_pattern: pattern used by spark to save multiple files. 390 | 391 | :rtype: str 392 | :returns: rendered URI used to read the saved data back. 393 | """ 394 | return os.path.join(inter_uri, name_pattern) 395 | 396 | 397 | @staticmethod 398 | def _process_json(row, args): 399 | """Mapper function that extracts interactions between customers 400 | and skus from each line of a datajet file. 401 | 402 | :type row: str 403 | :param row: json string with datajet data. 404 | 405 | :type args: namedtuple 406 | :param args: contains values to specify how the json transformation 407 | should happen. 408 | 409 | :type w_browse: float 410 | :param w_browse: weight associated to the browsing patterns of 411 | customers. 412 | 413 | :type w_purchase: float 414 | :param w_purchase: weight associated to purchasing patterns of 415 | customers. 416 | 417 | :type decay: float 418 | :param decay: determines how much past interactions should be less 419 | meaningful as time passes by.
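As an illustration only (the values are invented; the keys mirror what the parsing below expects), a minimal ``productview`` line such as {"event": {"source": {"tracker": "fish"}, "local_timestamp": 1500000000000, "identifiers": {"djUCID": {"value": "user_0"}}, "type": "productview", "details": {"product": {"group_id": "sku_0"}}}} would yield ["user_0", ("sku_0", w_browse * decay_factor)], and an ``orderconfirmation`` line yields one such pair per product in ``details.products``, scored with ``w_purchase``.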
420 | 421 | :rtype: list 422 | :returns: `yield` on [customerID, (sku, score)] 423 | """ 424 | try: 425 | r = json.loads(row) 426 | if (r['event']['source']['tracker'] == 'fish' and 427 | 'local_timestamp' in r['event'] and 428 | r['event']['identifiers']['djUCID']['value'] and 429 | r['event']['type'] in {"productview", "orderconfirmation"}): 430 | 431 | decay_factor = math.exp(-args.decay * (datetime.datetime.now() - 432 | datetime.datetime.utcfromtimestamp( 433 | int(r['event']['local_timestamp']) / 1000.0)).days) 434 | 435 | type_ = r['event']['type'] 436 | score = (args.w_browse if type_ == 'productview' 437 | else args.w_purchase) * decay_factor 438 | 439 | if type_ == 'productview': 440 | yield [r['event']['identifiers']['djUCID']['value'], 441 | (r['event']['details']['product']['group_id'], score)] 442 | elif type_ == 'orderconfirmation': 443 | for product in r['event']['details']['products']: 444 | yield [r['event']['identifiers']['djUCID']['value'], 445 | (product['group_id'], score)] 446 | except Exception:  # skip lines that are malformed or miss expected fields 447 | yield [] 448 | 449 | 450 | @staticmethod 451 | def _aggregate_skus(row): 452 | """Aggregates each customer's skus, summing their respective scores. 453 | 454 | :type row: list 455 | :param row: row having values [user, [(sku, score), ...]] 456 | 457 | :rtype: list 458 | :returns: `yield` on [user, [(sku, sum(score)), ...]] 459 | """ 460 | d = defaultdict(float) 461 | for inner_row in row[1]: 462 | d[inner_row[0]] += inner_row[1] 463 | yield (row[0], list(d.items())) 464 | 465 | 466 | def process_sysargs(self, args): 467 | parser = argparse.ArgumentParser() 468 | 469 | parser.add_argument('--days_init', 470 | dest='days_init', 471 | type=int, 472 | help=("Total amount of days to come back in time " 473 | "from today's date.")) 474 | 475 | parser.add_argument('--days_end', 476 | dest='days_end', 477 | type=int, 478 | help=("How many days back from today the scanning " 479 | "of input files should stop.")) 480 | 481 | parser.add_argument('--source_uri', 482 | dest='source_uri', 483 | type=str, 484 | help=("URI template from where to read source " 485 | "files from.")) 486 | 487 | parser.add_argument('--inter_uri', 488 | dest='inter_uri', 489 | type=str, 490 | help=('URI for saving intermediary results.')) 491 | 492 | parser.add_argument('--threshold', 493 | dest='threshold', 494 | type=float, 495 | help=('Threshold for acceptable similarity relative' 496 | ' error.')) 497 | 498 | parser.add_argument('--force', 499 | dest='force', 500 | type=str, 501 | help=('If ``yes`` then replace all files with new ones. ' 502 | 'If ``no``, then no replacing happens.')) 503 | 504 | parser.add_argument('--users_matrix_uri', 505 | dest='users_matrix_uri', 506 | type=str, 507 | default=None, 508 | help=('where to save matrix of users. If ``None`` ' 509 | 'then the matrix is not built.')) 510 | 511 | parser.add_argument('--neighbor_uri', 512 | dest='neighbor_uri', 513 | type=str, 514 | help=('where to save the matrix of sku similarities.')) 515 | 516 | parser.add_argument('--w_browse', 517 | dest='w_browse', 518 | type=float, 519 | help=('weight associated to browsing action score')) 520 | 521 | parser.add_argument('--w_purchase', 522 | dest='w_purchase', 523 | type=float, 524 | help=('weight associated to purchasing action score')) 525 | 526 | parser.add_argument('--decay', 527 | dest='decay', 528 | type=float, 529 | help=('Decay factor that discounts interactions the further in the past they happened.')) 530 | 531 | args = parser.parse_args(args) 532 | return args 533 | 534 | --------------------------------------------------------------------------------
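A minimal end-to-end driver sketch for the job above, mirroring how tests/system/spark_jobs/test_neighbor.py exercises it (all paths, weights and thresholds below are illustrative, and a plain namedtuple stands in for the arguments parsed by process_sysargs):

from collections import namedtuple

from pyspark import SparkContext

# Assumes spark_jobs/ is on the path (the module itself does sys.path.append('..')).
from neighbor import MarrecoNeighborJob

sc = SparkContext('local[*]', 'marreco_neighbor_sketch')
job = MarrecoNeighborJob()

# The field names mirror process_sysargs and the system tests.
TransformArgs = namedtuple('args', ['days_init', 'days_end', 'w_browse',
                                    'w_purchase', 'force', 'source_uri',
                                    'inter_uri', 'neighbor_uri', 'threshold',
                                    'decay'])
BuildArgs = namedtuple('args', ['days_init', 'days_end', 'inter_uri',
                                'neighbor_uri', 'threshold',
                                'users_matrix_uri'])

# 1) Turn raw datajet dumps into daily [user, [(sku, score)]] intermediary files.
job.transform_data(sc, TransformArgs(
    days_init=2, days_end=1, w_browse=0.5, w_purchase=6.0, force='no',
    source_uri='data/datajet/{}/*.json', inter_uri='data/inter/{}',
    neighbor_uri='data/neighbor', threshold=0.0, decay=0.03))

# 2) Aggregate the daily files and run DIMSUM to build the sku similarity matrix.
job.build_marreco(sc, BuildArgs(
    days_init=2, days_end=1, inter_uri='data/inter/{}',
    neighbor_uri='data/neighbor', threshold=0.1,
    users_matrix_uri='data/users'))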