├── data
│   ├── __init__.py
│   ├── queries
│   │   └── marreco
│   │       └── datajet
│   │           ├── search.sql
│   │           ├── orderconfirmation.sql
│   │           ├── purchase.sql
│   │           └── productview.sql
│   ├── help.py
│   └── exporter.py
├── tests
│   ├── __init__.py
│   ├── data
│   │   ├── test_template.html
│   │   ├── build_query_test
│   │   ├── test_macro_template.html
│   │   ├── search_mock.json
│   │   ├── orderconfirmation_mock.json
│   │   └── productview_mock.json
│   ├── system
│   │   ├── data
│   │   │   ├── neighbor
│   │   │   │   ├── transformed_2.json
│   │   │   │   ├── transformed_1.json
│   │   │   │   └── train
│   │   │   │       ├── 1
│   │   │   │       │   └── train.json
│   │   │   │       └── 2
│   │   │   │           └── train.json
│   │   │   ├── top_seller
│   │   │   │   ├── train
│   │   │   │   │   ├── 1
│   │   │   │   │   │   └── train.json
│   │   │   │   │   └── 2
│   │   │   │   │       └── train.json
│   │   │   │   └── datajet_test.json
│   │   │   └── datajet_test.json
│   │   └── spark_jobs
│   │       ├── test_top_seller.py
│   │       └── test_neighbor.py
│   └── unit
│       ├── spark_jobs
│       │   ├── test_factory.py
│       │   ├── test_base.py
│       │   ├── test_run_marreco.py
│       │   ├── test_top_seller.py
│       │   └── test_neighbor.py
│       └── data
│           ├── test_help.py
│           └── test_exporter.py
├── spark_jobs
│   ├── __init__.py
│   ├── factory.py
│   ├── run_marreco.py
│   ├── base.py
│   ├── top_seller.py
│   └── neighbor.py
├── bin
│   ├── pytest.sh
│   ├── pytest_system_neighbor.sh
│   ├── export_datajet.sh
│   ├── dataproc_top_seller.py
│   ├── dataproc_neighbor.sh
│   ├── utils.sh
│   ├── create_cluster.sh
│   ├── launch_jupyter_interface.sh
│   └── export_datajet.py
├── .coveragerc
├── requirements.txt
├── notebooks
│   ├── .gitignore
│   ├── marreco_dense_dimsum.ipynb
│   ├── marreco_df.ipynb
│   ├── marreco_dimsum_internal.ipynb
│   ├── rdd_marreco_test.ipynb
│   └── marreco_dimsum_sparse.ipynb
├── .gitignore
├── LICENSE
├── nox.py
└── README.md
/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/spark_jobs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/data/test_template.html:
--------------------------------------------------------------------------------
1 | 
2 | {{name}}
--------------------------------------------------------------------------------
/bin/pytest.sh:
--------------------------------------------------------------------------------
1 | py.test tests/unit/ --quiet --cov=. --cov-fail-under=100
2 |
--------------------------------------------------------------------------------
/bin/pytest_system_neighbor.sh:
--------------------------------------------------------------------------------
1 | py.test tests/system/spark_jobs/test_neighbor.py --quiet --cov=. --cov-fail-under=100
2 |
--------------------------------------------------------------------------------
/tests/data/build_query_test:
--------------------------------------------------------------------------------
1 | SELECT
2 | data
3 | FROM table
4 | WHERE init_days = {{days_interval}} and {{days_interval_end}}
5 |
--------------------------------------------------------------------------------
/bin/export_datajet.sh:
--------------------------------------------------------------------------------
1 | python export_datajet.py --days_init=2 --days_end=1 --uri gs://lbanor/pyspark/{day}/train{idx}*.gz --table=dj1 --dataset=simona
2 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 |
4 | [report]
5 | fail_under = 100
6 | show_missing = True
7 | exclude_lines =
8 | if __name__ == .__main__.:
9 |
--------------------------------------------------------------------------------
/tests/data/test_macro_template.html:
--------------------------------------------------------------------------------
1 | {% macro func(v1, v2) %}
2 | value of v1: {{v1}}
3 | value of v2: {{v2}}
4 | {% endmacro %}
5 | {{func(v1, v2)}}
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Jinja2==2.9.5
2 | mock==2.0.0
3 | pytest==3.2.0
4 | pytest-cov==2.5.1
5 | #pyspark==2.2.0
6 | google-cloud-bigquery==0.26.0
7 | google-cloud-storage==1.3.1
8 | numpy==1.12.1
9 |
--------------------------------------------------------------------------------
/tests/system/data/neighbor/transformed_2.json:
--------------------------------------------------------------------------------
1 | {"user_id":"3","interacted_items":[{"key":"1","score":0.5},{"key":"0","score":0.5}]}
2 | {"user_id":"2","interacted_items":[{"key":"0","score":6.0}]}
3 | {"user_id":"0","interacted_items":[{"key":"3","score":1.0}]}
4 | {"user_id":"1","interacted_items":[{"key":"1","score":1.0}]}
5 |
--------------------------------------------------------------------------------
/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | *.sw[op]
3 |
4 | # C extensions
5 | *.so
6 |
7 | # Packages
8 | *.egg
9 | *.egg-info
10 | dist
11 | build
12 | eggs
13 | parts
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 |
22 | # Unit test / coverage reports
23 | .coverage
24 | .nox
25 | .tox
26 | .cache
27 | htmlcov
28 |
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | *.sw[op]
3 | *.crc
4 | *.gz
5 | _SUCCESS
6 |
7 | # Packages
8 | *.egg
9 | *.egg-info
10 | dist
11 | build
12 | eggs
13 | parts
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 |
22 | # Unit test / coverage reports
23 | .coverage
24 | .nox
25 | .tox
26 | .cache
27 | htmlcov
28 |
29 | #GCP keys
30 | key.json
31 |
--------------------------------------------------------------------------------
/bin/dataproc_top_seller.py:
--------------------------------------------------------------------------------
1 | gcloud dataproc jobs submit pyspark --cluster=test3 --py-files=base.py,factory.py,top_seller.py --bucket=lbanor run_marreco.py -- --days_init=4 --days_end=2 --source_uri=gs://lbanor/pyspark/train_{}_*.gz --inter_uri=gs://lbanor/pyspark/marreco/top_seller/intermediate/{} --force=no --top_seller_uri=gs://lbanor/pyspark/marreco/top_seller/results --algorithm=top_seller
2 |
--------------------------------------------------------------------------------
/bin/dataproc_neighbor.sh:
--------------------------------------------------------------------------------
1 | gcloud dataproc jobs submit pyspark --cluster=test3 --py-files=base.py,factory.py,neighbor.py --bucket=lbanor run_marreco.py -- --days_init=4 --days_end=4 --source_uri=gs://lbanor/pyspark/{}/train*.gz --inter_uri=gs://lbanor/pyspark/marreco/neighbor/intermediate/{} --threshold=0.1 --force=no --users_matrix_uri=gs://lbanor/pyspark/marreco/neighbor/user_matrix --decay=0.03 --w_browse=0.5 --w_purchase=6.0 --neighbor_uri=gs://lbanor/pyspark/marreco/neighbor/neighbor_matrix --algorithm=neighbor
2 |
--------------------------------------------------------------------------------
/tests/system/data/neighbor/transformed_1.json:
--------------------------------------------------------------------------------
1 | {"user_id":"2","interacted_items":[{"key":"1","score":1.0}, {"key":"2","score":0.5}, {"key":"3","score":0.5}]}
2 | {"user_id":"3","interacted_items":[{"key":"0","score":0.5}, {"key":"1","score":0.5}, {"key":"2","score":6.0}, {"key":"3","score":6.0}]}
3 | {"user_id":"0","interacted_items":[{"key":"0","score":0.5}, {"key":"1","score":1.0}, {"key":"2","score":0.5}, {"key":"3","score":1.0}]}
4 | {"user_id":"1","interacted_items":[{"key":"0","score":1.0}, {"key":"1","score":1.0}, {"key":"2","score":1.0}, {"key":"3","score":0.5}]}
5 |
--------------------------------------------------------------------------------
/bin/utils.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | function_exists () {
4 | declare -f -F $1 > /dev/null
5 | return $?
6 | }
7 |
8 | throw () {
9 | echo "$*" >&2
10 | echo
11 | function_exists usage && usage
12 | exit 1
13 | }
14 |
15 | get_metadata_property () {
16 | [[ -z $1 ]] && throw "missing function param for DATAPROC_CLUSTER_NAME" || DATAPROC_CLUSTER_NAME=$1
17 | [[ -z $2 ]] && throw "missing function param for METADATA_KEY" || METADATA_KEY=$2
18 | # Get $DATAPROC_CLUSTER_NAME metadata value for key $METADATA_KEY...
19 | gcloud dataproc clusters describe $DATAPROC_CLUSTER_NAME | python -c "import sys,yaml; cluster = yaml.load(sys.stdin); print(cluster['config']['gceClusterConfig']['metadata']['$METADATA_KEY'])"
20 | }
21 |
--------------------------------------------------------------------------------
/tests/data/search_mock.json:
--------------------------------------------------------------------------------
1 | {
2 | "event": {
3 | "schema_version": 1,
4 | "user": {
5 | "location": {}
6 | },
7 | "device": {
8 | "origin": "web"
9 | },
10 | "source": {
11 | "tracker": "hawk",
12 | "url": "/",
13 | "url_referrer": "/"
14 | },
15 | "created_at": 1502582400127,
16 | "type": "search_response",
17 | "details": {
18 | "generation_ms": 69,
19 | "request": {
20 | "category_dept": 1,
21 | "facet_count": 1000,
22 | "facets": ["brand", "price", "size", "gender", "color", "categories_slugs", "categories_ids", "owner", "category"],
23 | "fields": "*",
24 | "filters": {
25 | "brand.slug": ["calvin-klein-kids"],
26 | "categories_ids": ["257"]
27 | },
28 | "gs": 3,
29 | "rq": 5,
30 | "size": 48,
31 | "sort": "relevance",
32 | "top_product_ids": [""]
33 | },
34 | "response": {
35 | "count": 189,
36 | "id": "d7c104cab610e7edf07290428c4db4e6ec49fcc1",
37 | "items": ["CA947APM37XCS",
38 | "CA947APM24OVZ"
39 | ]
40 | }
41 | }
42 | },
43 | "created_at": 1502582400127
44 | }
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Willian Fuks
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/data/orderconfirmation_mock.json:
--------------------------------------------------------------------------------
1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"03651597b830fbbee9c7f4299989bd48","type":"bid"},"djUCID":{"value":"610574c802ba3b33","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 7.0; XT1635-02 Build/NPN25.137-24-1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/136.0.0.22.91;]","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/checkout/success/","url_referrer":"checkout.dafiti.com.br/checkout/finish/"},"created_at":1502582416663,"local_timestamp":1502582415616,"type":"orderconfirmation","details":{"order_id":"15965531","products":[{"id":"","price":{"current":74.5},"group_id":"DA923SHF35RHK","skus":["DA923SHF35RHK"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]},{"id":"","price":{"current":74.5},"group_id":"VI618SHF69UQC","skus":["VI618SHF69UQC"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]}],"quantities":[1,1]}},"created_at":1502582416663}
2 |
--------------------------------------------------------------------------------
/bin/create_cluster.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | function usage {
5 | echo "Creates a Dataproc cluster with a Jupyter interface."
6 | echo "usage $0: [-h] [-n=name] [-b=bucket]"
7 | echo " -h display help"
8 | echo " -n=name name of cluster to create"
9 | echo " -b=bucket name of bucket in GCS for persistence"
10 | exit 1
11 | }
12 |
13 | for i in "$@"
14 | do
15 | case $i in
16 | -n=*)
17 | CLUSTER_NAME="${i#*=}"
18 | shift
19 | ;;
20 | -b=*)
21 | BUCKET_NAME="${i#*=}"
22 | shift
23 | ;;
24 | -h)
25 | usage
26 | ;;
27 | *)
28 | ;;
29 | esac
30 | done
31 |
32 |
33 | [[ -z $CLUSTER_NAME ]] && usage
34 | [[ -z $BUCKET_NAME ]] && usage
35 |
36 | gcloud dataproc clusters create $CLUSTER_NAME \
37 | --metadata "JUPYTER_PORT=8124,JUPYTER_CONDA_PACKAGES=numpy:pandas:scikit-learn:jinja2:mock:pytest:pytest-cov" \
38 | --initialization-actions \
39 | gs://dataproc-initialization-actions/jupyter/jupyter.sh \
40 | --bucket $BUCKET_NAME \
41 | --num-workers 2
42 | #--worker-machine-type=n1-highcpu-8 \
43 | #--master-machine-type=n1-highcpu-8
44 |
--------------------------------------------------------------------------------
/tests/system/data/neighbor/train/2/train.json:
--------------------------------------------------------------------------------
1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
3 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
4 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
5 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
6 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
7 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}]}}}
8 |
--------------------------------------------------------------------------------
/data/queries/marreco/datajet/search.sql:
--------------------------------------------------------------------------------
1 | #standardSQL
2 | SELECT
3 | data.*
4 | FROM(
5 | SELECT
6 | ARRAY(
7 | SELECT AS STRUCT
8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user,
9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers,
10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device,
11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source,
12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at,
13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp,
14 | "search" AS type,
15 | STRUCT(REGEXP_EXTRACT(page.pagePath, r'/\?q=(.*)') AS query, "keyword" AS query_type) AS details) event,
16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at
17 | FROM UNNEST(hits) WHERE REGEXP_CONTAINS(page.pagePath, r'/\?q=')) data
18 | FROM `{{dataset}}.ga_sessions_*`
19 | WHERE True
20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE REGEXP_CONTAINS(page.pagePath, r'/\?q='))
21 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval_end}} DAY))
22 | ),
23 | UNNEST(data) data
24 |
--------------------------------------------------------------------------------
/tests/data/productview_mock.json:
--------------------------------------------------------------------------------
1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","url_referrer":"m.dafiti.com.br/catalog/?q=Enfim\\u0026wtqs=1\\u0026dft_capi=1\\u0026page=7"},"created_at":1502582400021,"local_timestamp":1502589588052,"type":"productview","details":{"product":{"id":"","title":"Moletom Enfim Estampado Azul","brand":{"name":"Enfim"},"price":{"current":84.99},"group_id":"MA042APM76IPJ","skus":["MA042APM76IPJ"],"categories":[[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}]],"main_category_path":[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}],"url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","images":["https://dafitistatic-a.akamaihd.net/p/Enfim-Moletom-Enfim-Estampado-Azul-5611-3243292-1-zoom.jpg"],"colors":["Azul"]}}},"created_at":1502582400021}
2 |
--------------------------------------------------------------------------------
/tests/system/data/top_seller/train/2/train.json:
--------------------------------------------------------------------------------
1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
3 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
4 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
5 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
6 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
7 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}], "quantities": [1]}}}
8 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "1"}], "quantities": [1]}}}
9 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "2"}], "quantities": [2]}}}
10 |
--------------------------------------------------------------------------------
/tests/unit/spark_jobs/test_factory.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | import unittest
24 | import sys
25 | import os
26 | import mock
27 |
28 | sys.path.append('./spark_jobs')
29 |
30 | class Test_factory(unittest.TestCase):
31 | @staticmethod
32 | def _get_target_class():
33 | from factory import MarrecoFactory
34 |
35 |
36 | return MarrecoFactory
37 |
38 |
39 | def test_factor_alg(self):
40 | klass = self._get_target_class()
41 | with self.assertRaises(ValueError):
42 | klass._factor_alg('test')
43 |
44 | top_seller = klass._factor_alg('top_seller')
45 | self.assertEqual(top_seller.__name__, 'MarrecoTopSellerJob')
46 |
47 | neighbor = klass._factor_alg('neighbor')
48 | self.assertEqual(neighbor.__name__, 'MarrecoNeighborJob')
49 |
--------------------------------------------------------------------------------
/tests/unit/spark_jobs/test_base.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | import unittest
24 | import sys
25 | import os
26 | import mock
27 |
28 | sys.path.append('./spark_jobs')
29 |
30 | class Test_base(unittest.TestCase):
31 | @staticmethod
32 | def _get_target_class():
33 | from base import MarrecoBase
34 |
35 |
36 | return MarrecoBase
37 |
38 |
39 | def test_ctor(self):
40 | klass = self._get_target_class()(['test'])
41 | self.assertEqual(klass.tasks, ['test'])
42 |
43 |
44 | def test_run_tasks(self):
45 | method = mock.Mock()
46 | kwargs = {'1': 1}
47 | sc = mock.Mock()
48 | klass = self._get_target_class()([(method, kwargs)])
49 | print(klass.tasks)
50 | klass.run_tasks(sc)
51 |
52 | method.assert_called_once_with(sc, **kwargs)
53 |
--------------------------------------------------------------------------------
/data/queries/marreco/datajet/orderconfirmation.sql:
--------------------------------------------------------------------------------
1 | #standardSQL
2 | SELECT
3 | data.*
4 | FROM(
5 | SELECT
6 | ARRAY(
7 | SELECT AS STRUCT
8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user,
9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers,
10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device,
11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source,
12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at,
13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp,
14 | "orderconfirmation" AS type,
15 | STRUCT(transaction.transactionId AS order_id, ARRAY(SELECT STRUCT(STRUCT(productBrand AS name) AS brand, STRUCT(productPrice / 10e6 AS `current`, productPrice / 10e6 AS previous) AS price, REGEXP_EXTRACT(productSKU, r'(.*)-\d+') AS group_id, [COALESCE(REGEXP_EXTRACT(productSKU, r'(.*)-\d+'), productSKU), productSKU] AS skus, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT( v2productCategory, '|')) v) AS main_category_path) FROM UNNEST(product)) AS products, ARRAY(SELECT productQuantity FROM UNNEST(product)) AS quantities) AS details) event,
16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at
17 | FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6') data
18 | FROM `40663402.ga_sessions_*`
19 | WHERE True
20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6')
21 | AND NOT EXISTS(SELECT 1 FROM UNNEST(hits), UNNEST(product) WHERE productSKU IS NULL OR productQuantity IS NULL)
22 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_init}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_end}} DAY))
23 | ),
24 | UNNEST(data) data
25 |
26 |
--------------------------------------------------------------------------------
/data/queries/marreco/datajet/purchase.sql:
--------------------------------------------------------------------------------
1 | #standardSQL
2 | SELECT
3 | data.*
4 | FROM(
5 | SELECT
6 | ARRAY(
7 | SELECT AS STRUCT
8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user,
9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers,
10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device,
11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source,
12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at,
13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp,
14 | "orderconfirmation" AS type,
15 | STRUCT(transaction.transactionId AS order_id, ARRAY(SELECT STRUCT(STRUCT(productBrand AS name) AS brand, STRUCT(productPrice / 10e6 AS `current`, productPrice / 10e6 AS previous) AS price, REGEXP_EXTRACT(productSKU, r'(.*)-\d+') AS group_id, [COALESCE(REGEXP_EXTRACT(productSKU, r'(.*)-\d+'), productSKU), productSKU] AS skus, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT( v2productCategory, '|')) v) AS main_category_path) FROM UNNEST(product)) AS products, ARRAY(SELECT productQuantity FROM UNNEST(product)) AS quantities) AS details) event,
16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at
17 | FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6') data
18 | FROM `{{dataset}}.ga_sessions_*`
19 | WHERE True
20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE ecommerceaction.action_type = '6')
21 | AND NOT EXISTS(SELECT 1 FROM UNNEST(hits), UNNEST(product) WHERE productSKU IS NULL OR productQuantity IS NULL)
22 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval_end}} DAY))
23 | ),
24 | UNNEST(data) data
25 |
--------------------------------------------------------------------------------
/tests/unit/data/test_help.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 |
24 | import unittest
25 | import sys
26 | import os
27 | import mock
28 |
29 | sys.path.append('./data')
30 |
31 | class Test_help(unittest.TestCase):
32 | @staticmethod
33 | def _get_target_class():
34 | from help import Jinjafy
35 |
36 |
37 | return Jinjafy
38 |
39 |
40 | def test_ctor(self):
41 | klass = self._get_target_class()('.')
42 | self.assertEqual(klass.env.loader.searchpath, ['.'])
43 |
44 |
45 | def test_render_template(self):
46 | print(os.path.abspath('.'))
47 | klass = self._get_target_class()('tests/data')
48 | result = klass.render_template('test_template.html',
49 | **{'name': 'test'})
50 |
51 | expected = """
test
"""
52 | self.assertEqual(result, expected)
53 |
--------------------------------------------------------------------------------
/spark_jobs/factory.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 |
24 | """
25 | Main Class to manage Spark Jobs.
26 | """
27 |
28 | class MarrecoFactory(object):
29 | """Factory to get appropriate algorithm strategy.
30 |
31 | :type alg: str
32 | :param alg: states which algorithm should be prepared.
33 |
34 | :rtype: `base.MarrecoBase`
35 | :returns: algorithm strategy ready to run jobs and analysis.
36 | """
37 | @classmethod
38 | def _factor_alg(cls, alg):
39 | if alg == 'top_seller':
40 | from top_seller import MarrecoTopSellerJob
41 | return MarrecoTopSellerJob
42 | elif alg == 'neighbor':
43 | from neighbor import MarrecoNeighborJob
44 | return MarrecoNeighborJob
45 | else:
46 | raise ValueError("Algorithm '{}' is not available. Please choose "
47 | "between 'neighbor' or 'top_seller'".format(alg))
48 |
--------------------------------------------------------------------------------
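A minimal sketch of the factory dispatch above, run from the repository root and mirroring tests/unit/spark_jobs/test_factory.py; note that importing the algorithm modules may require pyspark to be importable, as on the Dataproc image.

import sys
sys.path.append('./spark_jobs')

from factory import MarrecoFactory

# A known algorithm name returns the corresponding job class...
top_seller_class = MarrecoFactory._factor_alg('top_seller')   # MarrecoTopSellerJob
neighbor_class = MarrecoFactory._factor_alg('neighbor')       # MarrecoNeighborJob

# ...and any other name raises ValueError:
# MarrecoFactory._factor_alg('unknown')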
/data/queries/marreco/datajet/productview.sql:
--------------------------------------------------------------------------------
1 | #standardSQL
2 | SELECT
3 | data.*
4 | FROM(
5 | SELECT
6 | ARRAY(
7 | SELECT AS STRUCT
8 | STRUCT(1 as schema_version, STRUCT(STRUCT(NULL) AS location, "" AS gender) as user,
9 | STRUCT(STRUCT("11" AS value, "bid" AS type) AS bid, STRUCT("_392" AS value, "customer_user_id" AS type) AS customer_user_id, STRUCT(fullvisitorid AS value, "djUCID" as type) AS djUCID) AS identifiers,
10 | STRUCT(device.browser AS client, device.operatingSystem AS os, device.deviceCategory AS origin) AS device,
11 | STRUCT("fish" AS tracker, page.pagePath AS url, referer AS url_referrer) AS source,
12 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at,
13 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS local_timestamp,
14 | "productview" AS type,
15 | STRUCT(STRUCT((SELECT v2ProductName FROM UNNEST(product)) AS title, STRUCT((SELECT productBrand FROM UNNEST(product)) AS name) AS brand, STRUCT((SELECT productPrice / 10e6 FROM UNNEST(product)) AS `current`, (SELECT productPrice / 10e6 FROM UNNEST(product)) AS `previous`) AS price, (SELECT productSKU FROM UNNEST(product)) AS group_id, ARRAY(SELECT productSKU FROM UNNEST(product)) AS skus, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT((SELECT v2productCategory FROM UNNEST(product)), ',')) v) AS categories, ARRAY(SELECT AS STRUCT v AS name, REGEXP_REPLACE(v, ' ', '-') AS slug FROM UNNEST(SPLIT((SELECT v2productCategory FROM UNNEST(product)), ',')) v) AS main_category_path, page.pagePath AS url, ARRAY(SELECT page.pagePath FROM UNNEST(hits) LIMIT 1) AS images) AS product) AS details) event,
16 | UNIX_MILLIS(CURRENT_TIMESTAMP()) AS created_at
17 | FROM UNNEST(hits) WHERE ecommerceaction.action_type = '2') data
18 | FROM `{{dataset}}.ga_sessions_*`
19 | WHERE True
20 | AND EXISTS(SELECT 1 FROM UNNEST(hits) WHERE ecommerceaction.action_type = '2')
21 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval}} DAY)) AND FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {{days_interval_end}} DAY))
22 | ),
23 | UNNEST(data) data
24 |
--------------------------------------------------------------------------------
/data/help.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | """Helper functions with general scopes"""
24 |
25 | import os
26 | import uuid
27 | from jinja2 import Environment, FileSystemLoader
28 |
29 | from google.cloud.bigquery import Client as bq_Client
30 | from google.cloud.storage import Client as s_Client
31 |
32 |
33 | class Jinjafy(object):
34 | """Handles main operations related to Jinja such as creating
35 | environments, rendering templates and related operations.
36 |
37 | :type loader_path: str
38 | :param loader_path: folder where the Jinja environment is built
39 | """
40 |
41 | def __init__(self, loader_path):
42 | self.env = Environment(loader=FileSystemLoader(loader_path))
43 |
44 |
45 | def render_template(self, file_path, **kwargs):
46 | """Gets Jinja template and return the file rendered based on kwargs input.
47 |
48 | :type file_path: str
49 | :param file_path: path to file containing jinja template
50 |
51 | :param kwargs: key values to render jinja template.
52 | """
53 | return self.env.get_template(file_path).render(**kwargs)
54 |
--------------------------------------------------------------------------------
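A short usage sketch of Jinjafy rendering one of the query templates under data/queries, run from the repository root with the requirements installed; the dataset name and day values below are illustrative.

import sys
sys.path.append('./data')

from help import Jinjafy

# Point the Jinja environment at the datajet queries folder and fill the
# {{dataset}}, {{days_interval}} and {{days_interval_end}} placeholders.
jinja = Jinjafy('data/queries/marreco/datajet')
query = jinja.render_template('search.sql',
                              dataset='my_ga_dataset',
                              days_interval=2,
                              days_interval_end=1)
print(query)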
/spark_jobs/run_marreco.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 |
24 | """
25 | Builds Marreco to run Jobs in Spark.
26 | """
27 |
28 | import sys
29 | import argparse
30 |
31 | import pyspark
32 | from factory import MarrecoFactory
33 |
34 | def get_alg(args):
35 | parser = argparse.ArgumentParser()
36 |
37 | args = [e for e in args if 'algorithm' in e or '-h' in e]
38 | if len(args) == 2:
39 | args.remove('-h')
40 | parser.add_argument('--algorithm',
41 | dest='algorithm',
42 | type=str,
43 | help=('Which algorithm to run. Currently options are '
44 | '"neighbor" or "top seller"'))
45 |
46 | args = parser.parse_args(args)
47 | return args
48 |
49 | def main():
50 | alg = get_alg(sys.argv[1:]).algorithm
51 | if alg:
52 | job = MarrecoFactory._factor_alg(alg)()
53 | args = job.process_sysargs(
54 | [e for e in sys.argv[1:] if 'algorithm' not in e])
55 |
56 | with pyspark.SparkContext() as sc:
57 | job.transform_data(sc, args)
58 | job.build_marreco(sc, args)
59 |
60 |
61 | if __name__ == '__main__':
62 | sys.exit(main())
63 |
--------------------------------------------------------------------------------
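A small sketch of how get_alg isolates the --algorithm flag before the remaining flags are handed to the job's own process_sysargs; pyspark must be importable since run_marreco imports it at module level, and the flag values below are illustrative.

import sys
sys.path.append('./spark_jobs')

from run_marreco import get_alg

# Only entries containing 'algorithm' (or '-h') are parsed here.
args = get_alg(['--algorithm=neighbor', '--days_init=4', '--days_end=4'])
print(args.algorithm)   # neighbor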
/nox.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | import os
24 | import nox
25 |
26 | @nox.session
27 | @nox.parametrize('python_version', ['3.6'])
28 | def unit_tests(session, python_version):
29 | """Run just unit testings"""
30 |
31 | session.interpreter = 'python{}'.format(python_version)
32 |
33 | # Set virtualenv dirname
34 | session.virtualenv_dirname = 'unit-' + python_version
35 |
36 | session.install('mock', 'pytest', 'pytest-cov')
37 | session.install('-e', '.')
38 |
39 | session.run('py.test',
40 | '--quiet',
41 | '--cov=tests.unit',
42 | '--cov-append',
43 | '--cov-config=.coveragerc',
44 | '--cov-report=',
45 | '--cov-fail-under=100',
46 | os.path.join('tests', 'unit'),
47 | *session.posargs
48 | )
49 |
50 | @nox.session
51 | @nox.parametrize('python_version', ['3.6'])
52 | def system_tests(session, python_version):
53 | """Run tests against a live spark (preferably a local cluster)."""
54 |
55 | session.interpreter = 'python{}'.format(python_version)
56 |
57 | session.virtualenv_dirname = 'sys-' + python_version
58 |
59 | session.install('mock', 'pytest')
60 | session.install('-e', '.')
61 |
62 | session.run('py.test',
63 | '--quiet',
64 | os.path.join('tests', 'system'),
65 | *session.posargs
66 | )
67 |
--------------------------------------------------------------------------------
/bin/launch_jupyter_interface.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | DIR="${BASH_SOURCE%/*}"
5 | [[ ! -d "$DIR" ]] && DIR="$PWD"
6 |
7 | source "$DIR/utils.sh"
8 |
9 | function usage {
10 | echo "Creates an SSH tunnel and socks proxy and launches Chrome, using the environment "
11 | echo "variable DATAPROC_CLUSTER_NAME for the unique cluster name. The cluster metadata "
12 | echo "must contain a value for the key 'JUPYTER_PORT'."
13 | echo ""
14 | echo "If the appropriate environment variables are not set and the appropriate command"
15 | echo "line arguments are not given, then the usage message will be displayed and the "
16 | echo "script will exit."
17 | echo ""
18 | echo "usage: $0 [-h] [-c=cluster-name] [-z=zone]"
19 | echo " -h display help"
20 | echo " -z=zone specify cloud zone for cluster"
21 | echo " -c=cluster-name specify unique dataproc cluster name to launch"
22 | exit 1
23 | }
24 |
25 | for i in "$@"
26 | do
27 | case $i in
28 | -z=*)
29 | ZONE="${i#*=}"
30 | shift # past argument=value
31 | ;;
32 | -c=*)
33 | DATAPROC_CLUSTER_NAME="${i#*=}"
34 | shift # past argument=value
35 | ;;
36 | -h)
37 | usage
38 | ;;
39 | *)
40 | ;;
41 | esac
42 | done
43 |
44 | [[ -z $DATAPROC_CLUSTER_NAME ]] && usage
45 | [[ -z $ZONE ]] && usage
46 | JUPYTER_PORT=$(get_metadata_property $DATAPROC_CLUSTER_NAME JUPYTER_PORT)
47 | [[ ! $JUPYTER_PORT =~ ^[0-9]+$ ]] && throw "metadata must contain a valid 'JUPYTER_PORT' value, but instead has the value \"$JUPYTER_PORT\""
48 |
49 | # TODO: Ensure that Jupyter notebook is running on cluster master node
50 |
51 | echo "Using following cluster name: $DATAPROC_CLUSTER_NAME"
52 | echo "Using following cluster zone: $ZONE"
53 | echo "Using following remote dataproc jupyter port: $JUPYTER_PORT"
54 | echo ""
55 |
56 | # 0. Set default path to Chrome application (by operating system type).
57 | # OS X
58 | #CHROME_APP_PATH="/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome"
59 | # Linux
60 | CHROME_APP_PATH="/usr/bin/google-chrome"
61 | # Windows
62 | #CHROME_APP_PATH="C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
63 |
64 | # Following configuration at:
65 | # https://cloud.google.com/dataproc/cluster-web-interfaces
66 | # 1. Setup ssh tunnel and socks proxy
67 | ZONE_FLAG=""
68 | [[ -v ZONE ]] && ZONE_FLAG="--zone=$ZONE"
69 | gcloud compute ssh $ZONE_FLAG --ssh-flag="-D 10000" --ssh-flag="-N" --ssh-flag="-n" "$DATAPROC_CLUSTER_NAME-m" &
70 | sleep 5 # Wait for tunnel to be ready before opening browser...
71 |
72 | # 2.Launch Chrome instance, referencing the proxy server.
73 | # TODO: Parameterize the chrome app path
74 | eval $CHROME_APP_PATH \
75 | "http://$DATAPROC_CLUSTER_NAME-m:$JUPYTER_PORT" \
76 | --proxy-server="socks5://localhost:10000" \
77 | --host-resolver-rules="MAP * 0.0.0.0 , EXCLUDE localhost" \
78 | --user-data-dir=/tmp/
79 |
80 |
--------------------------------------------------------------------------------
/tests/system/data/datajet_test.json:
--------------------------------------------------------------------------------
1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","url_referrer":"m.dafiti.com.br/catalog/?q=Enfim\\u0026wtqs=1\\u0026dft_capi=1\\u0026page=7"},"created_at":1502582400021,"local_timestamp":1502589588052,"type":"productview","details":{"product":{"id":"","title":"Moletom Enfim Estampado Azul","brand":{"name":"Enfim"},"price":{"current":84.99},"group_id":"MA042APM76IPJ","skus":["MA042APM76IPJ"],"categories":[[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}]],"main_category_path":[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}],"url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","images":["https://dafitistatic-a.akamaihd.net/p/Enfim-Moletom-Enfim-Estampado-Azul-5611-3243292-1-zoom.jpg"],"colors":["Azul"]}}},"created_at":1502582400021}
2 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"03651597b830fbbee9c7f4299989bd48","type":"bid"},"djUCID":{"value":"610574c802ba3b33","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 7.0; XT1635-02 Build/NPN25.137-24-1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/136.0.0.22.91;]","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/checkout/success/","url_referrer":"checkout.dafiti.com.br/checkout/finish/"},"created_at":1502582416663,"local_timestamp":1502582415616,"type":"orderconfirmation","details":{"order_id":"15965531","products":[{"id":"","price":{"current":74.5},"group_id":"DA923SHF35RHK","skus":["DA923SHF35RHK"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]},{"id":"","price":{"current":74.5},"group_id":"VI618SHF69UQC","skus":["VI618SHF69UQC"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]}],"quantities":[1,1]}},"created_at":1502582416663}
3 | {"event":{"schema_version":1,"user":{"location":{}},"device":{"origin":"web"},"source":{"tracker":"hawk","url":"/","url_referrer":"/"},"created_at":1502582400127,"type":"search_response","details":{"generation_ms":69,"request":{"category_dept":1,"facet_count":1000,"facets":["brand","price","size","gender","color","categories_slugs","categories_ids","owner","category"],"fields":"*","filters":{"brand.slug":["calvin-klein-kids"],"categories_ids":["257"]},"gs":3,"rq":5,"size":48,"sort":"relevance","top_product_ids":[""]},"response":{"count":189,"id":"d7c104cab610e7edf07290428c4db4e6ec49fcc1","items":["CA947APM37XCS","CA947APM24OVZ"]}}},"created_at":1502582400127}
4 |
--------------------------------------------------------------------------------
/tests/system/data/top_seller/datajet_test.json:
--------------------------------------------------------------------------------
1 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","url_referrer":"m.dafiti.com.br/catalog/?q=Enfim\\u0026wtqs=1\\u0026dft_capi=1\\u0026page=7"},"created_at":1502582400021,"local_timestamp":1502589588052,"type":"productview","details":{"product":{"id":"","title":"Moletom Enfim Estampado Azul","brand":{"name":"Enfim"},"price":{"current":84.99},"group_id":"MA042APM76IPJ","skus":["MA042APM76IPJ"],"categories":[[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}]],"main_category_path":[{"name":"masculino","slug":"masculino"},{"name":"roupas masculinas","slug":"roupas-masculinas"},{"name":"moletons","slug":"moletons"},{"name":"moletom aberto","slug":"moletom-aberto"}],"url":"m.dafiti.com.br/Moletom-Enfim-Estampado-Azul-2923423.html","images":["https://dafitistatic-a.akamaihd.net/p/Enfim-Moletom-Enfim-Estampado-Azul-5611-3243292-1-zoom.jpg"],"colors":["Azul"]}}},"created_at":1502582400021}
2 | {"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"03651597b830fbbee9c7f4299989bd48","type":"bid"},"djUCID":{"value":"610574c802ba3b33","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 7.0; XT1635-02 Build/NPN25.137-24-1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/136.0.0.22.91;]","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/checkout/success/","url_referrer":"checkout.dafiti.com.br/checkout/finish/"},"created_at":1502582416663,"local_timestamp":1502582415616,"type":"orderconfirmation","details":{"order_id":"15965531","products":[{"id":"","price":{"current":74.5},"group_id":"DA923SHF35RHK","skus":["DA923SHF35RHK"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]},{"id":"","price":{"current":74.5},"group_id":"VI618SHF69UQC","skus":["VI618SHF69UQC"],"main_category_path":[{"name":"calcados","slug":"calcados"},{"name":"calcados femininos","slug":"calcados-femininos"}]}],"quantities":[1,1]}},"created_at":1502582416663}
3 | {"event":{"schema_version":1,"user":{"location":{}},"device":{"origin":"web"},"source":{"tracker":"hawk","url":"/","url_referrer":"/"},"created_at":1502582400127,"type":"search_response","details":{"generation_ms":69,"request":{"category_dept":1,"facet_count":1000,"facets":["brand","price","size","gender","color","categories_slugs","categories_ids","owner","category"],"fields":"*","filters":{"brand.slug":["calvin-klein-kids"],"categories_ids":["257"]},"gs":3,"rq":5,"size":48,"sort":"relevance","top_product_ids":[""]},"response":{"count":189,"id":"d7c104cab610e7edf07290428c4db4e6ec49fcc1","items":["CA947APM37XCS","CA947APM24OVZ"]}}},"created_at":1502582400127}
4 |
--------------------------------------------------------------------------------
/tests/unit/spark_jobs/test_run_marreco.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | import unittest
24 | import sys
25 | import mock
26 | from collections import namedtuple
27 |
28 | sys.path.append('./spark_jobs')
29 |
30 |
31 | class Test_run_marreco(unittest.TestCase):
32 | def test_get_alg(self):
33 | from run_marreco import get_alg
34 |
35 |
36 | expected = 'test'
37 | args = get_alg(['--algorithm=test'])
38 | self.assertEqual(expected, args.algorithm)
39 |
40 |
41 | @mock.patch('run_marreco.get_alg')
42 | @mock.patch('run_marreco.pyspark')
43 | @mock.patch('run_marreco.MarrecoFactory')
44 | def test_main_runs(self, factory_mock, spark_mock, get_alg_mock):
45 | from run_marreco import main
46 |
47 |
48 | Args = namedtuple('args', 'algorithm')
49 | args = Args('test')
50 | get_alg_mock.return_value = args
51 |
52 | job_mock = mock.Mock()
53 | factory_mock._factor_alg.return_value.return_value = job_mock
54 |
55 | job_mock.process_sysargs.return_value = 'test'
56 | context_mock = mock.Mock()
57 | spark_mock.SparkContext.return_value = context_mock
58 | context_enter_mock = mock.Mock()
59 |
60 | context_mock.__enter__ = context_enter_mock
61 | context_mock.__exit__ = mock.Mock()
62 |
63 | main()
64 | job_mock.transform_data.assert_called_once_with(context_enter_mock(), 'test')
65 | job_mock.build_marreco.assert_called_once_with(context_enter_mock(), 'test')
66 |
67 |
68 | @mock.patch('run_marreco.get_alg')
69 | @mock.patch('run_marreco.pyspark')
70 | @mock.patch('run_marreco.MarrecoFactory')
71 | def test_main_does_not_run(self, factory_mock, spark_mock, get_alg_mock):
72 | from run_marreco import main
73 |
74 |
75 | Args = namedtuple('args', 'algorithm')
76 | args = Args(None)
77 | get_alg_mock.return_value = args
78 |
79 | job_mock = mock.Mock()
80 | factory_mock._factor_alg.return_value.return_value = job_mock
81 |
82 | job_mock.process_sysargs.return_value = 'test'
83 | context_mock = mock.Mock()
84 | spark_mock.SparkContext.return_value = context_mock
85 | context_enter_mock = mock.Mock()
86 |
87 | context_mock.__enter__ = context_enter_mock
88 | context_mock.__exit__ = mock.Mock()
89 |
90 | main()
91 | job_mock.transform_data.assert_not_called()
92 | job_mock.build_marreco.assert_not_called()
93 |
--------------------------------------------------------------------------------
/spark_jobs/base.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 |
24 | """
25 | Base Class for Algorithms in Spark.
26 | """
27 |
28 | import abc
29 | import datetime
30 |
31 | class MarrecoBase(object):
32 | """Base Class to run Jobs against Spark
33 |
34 | :type tasks: list
35 |     :param tasks: list of ``(task, {key: value})`` pairs to be used later
36 |         on when invoking ``self.run_tasks()``
37 | """
38 |     def __init__(self, tasks=None):
39 |         self.tasks = tasks if tasks is not None else []
40 |
41 |
42 | def run_tasks(self, sc):
43 |         """For each task saved in ``self.tasks``, uses the context ``sc`` to
44 | execute the jobs.
45 |
46 | :type sc: `pyspark.SparkContext`
47 | :param sc: spark context used to run the jobs.
48 | """
49 | if not self.tasks:
50 | raise ValueError("``self.tasks`` list is empty. Please specify"
51 | " which jobs you want to run")
52 |
53 | for method, kwargs in self.tasks:
54 | method(sc, **kwargs)
55 |
56 |
57 | @abc.abstractmethod
58 | def process_sysargs(self, args):
59 |         """Processes input arguments sent in sys args. Each algorithm has its
60 |         own implementation of the parsing.
61 |
62 | :type args: list
63 | :param args: list of arguments like ['--days_init=2', '--days_end=1']
64 | """
65 | pass
66 |
67 |
68 | @abc.abstractmethod
69 | def transform_data(self, sc, args):
70 |         """Gets data from datajet and transforms it so that Marreco can read
71 |         and use it properly. Each algorithm shall implement its own strategy.
72 | """
73 | pass
74 |
75 |
76 | @abc.abstractmethod
77 | def build_marreco(self, sc, args):
78 | """Main method for each algorithm where results are calculated, such
79 | as computing matrix similarities or top selling items.
80 | """
81 | pass
82 |
83 | @abc.abstractmethod
84 | def get_formatted_date(self, day):
85 | """This method is used mainly to transform the input of ``days``
86 | into a string of type ``YYYY-MM-DD``
87 |
88 | :type day: int
89 | :param day: how many days in time to come back from today to make
90 | the string transformation.
91 |
92 | :rtype: str
93 |         :returns: formatted date of (today - ``day``) in ``%Y-%m-%d`` format
94 | """
95 | return (datetime.datetime.now() -
96 | datetime.timedelta(days=day)).strftime('%Y-%m-%d')
97 |
98 |
--------------------------------------------------------------------------------
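As a usage illustration of the task list documented above, the sketch below registers two `(method, kwargs)` pairs and runs them with `run_tasks`. The subclass, the argument values, and the `None` passed in place of a real `pyspark.SparkContext` are assumptions made for the example only; this is not code from the repository.

```python
import sys
sys.path.append('./spark_jobs')  # assumes the same layout used by the unit tests

from base import MarrecoBase

class ExampleJob(MarrecoBase):
    """Illustrative subclass; the repository's concrete jobs are neighbor and top seller."""
    def process_sysargs(self, args):
        pass

    def transform_data(self, sc, args):
        print('transforming data for', self.get_formatted_date(args['day']))

    def build_marreco(self, sc, args):
        print('building results')

job = ExampleJob()
job.tasks = [(job.transform_data, {'args': {'day': 1}}),
             (job.build_marreco, {'args': {}})]
# In practice ``sc`` would be a pyspark.SparkContext; these toy methods ignore it.
job.run_tasks(None)
```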
/notebooks/marreco_dense_dimsum.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | cells: [
3 | {
4 | cell_type: "code",
5 | execution_count: null,
6 | metadata: {
7 | collapsed: true
8 | },
9 | outputs: [ ],
10 | source: [
11 | "from pyspark.mllib.linalg import SparseVector ",
12 | "from pyspark.mllib.linalg.distributed import RowMatrix ",
13 | "import numpy as np ",
14 | "from sklearn.metrics.pairwise import cosine_similarity ",
15 | "import time ",
16 | "from collections import defaultdict ",
17 | "from pyspark.sql import functions as sfunc ",
18 | "from pyspark.sql import types as stypes ",
19 | "import math ",
20 | "import sys ",
21 | "from pyspark.ml.linalg import SparseVector ",
22 | "from pyspark.mllib.linalg.distributed import RowMatrix ",
23 | "from operator import itemgetter "
24 | ]
25 | },
26 | {
27 | cell_type: "code",
28 | execution_count: null,
29 | metadata: {
30 | collapsed: true
31 | },
32 | outputs: [ ],
33 | source: [
34 | "schema = stypes.StructType().add("fv", stypes.StringType()).add("sku", stypes.StringType()).add("score", stypes.FloatType()) ",
35 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema) ",
36 | "train_df.createOrReplaceTempView('test1')"
37 | ]
38 | },
39 | {
40 | cell_type: "code",
41 | execution_count: null,
42 | metadata: {
43 | collapsed: true
44 | },
45 | outputs: [ ],
46 | source: [
47 | "query = """ ",
48 | "SELECT ",
49 | " sku, ",
50 | " ROW_NUMBER() OVER (ORDER BY SUM(1)) -1 idx ",
51 | "FROM test1 ",
52 | "GROUP BY 1 ",
53 | """" ",
54 | "skus_rdd = spark.sql(query).rdd"
55 | ]
56 | },
57 | {
58 | cell_type: "code",
59 | execution_count: null,
60 | metadata: {
61 | collapsed: true
62 | },
63 | outputs: [ ],
64 | source: [
65 | "d = {row.sku: row.idx for row in skus_rdd.collect()} ",
66 | "db = sc.broadcast(d) ",
67 | " ",
68 | "id_ = {value: key for key, value in d.items()} ",
69 | "id_b = sc.broadcast(id_)"
70 | ]
71 | },
72 | {
73 | cell_type: "code",
74 | execution_count: null,
75 | metadata: {
76 | collapsed: true
77 | },
78 | outputs: [ ],
79 | source: [
80 | "query_users_items = """ ",
81 | "SELECT ",
82 | "data ",
83 | "FROM( ",
84 | " SELECT ",
85 | " fv, ",
86 | " COLLECT_LIST(STRUCT(sku, score * 2 AS score)) data ",
87 | " FROM test1 ",
88 | " GROUP BY 1 ",
89 | ") ",
90 | "WHERE size(data) between 2 and 20 ",
91 | "LIMIT 3 ",
92 | """""
93 | ]
94 | },
95 | {
96 | cell_type: "code",
97 | execution_count: null,
98 | metadata: {
99 | collapsed: true
100 | },
101 | outputs: [ ],
102 | source: [
103 | "users_rdd = spark.sql(query_users_items).rdd"
104 | ]
105 | },
106 | {
107 | cell_type: "code",
108 | execution_count: null,
109 | metadata: {
110 | collapsed: true
111 | },
112 | outputs: [ ],
113 | source: [
114 | "def make_sparse(row): ",
115 | " tmp = sorted([(db.value[i.sku], i.score) for i in row.data], key=itemgetter(0)) ",
116 | " return (SparseVector(len(db.value), [e[0] for e in tmp], [e[1] for e in tmp]),) ",
117 | " ",
118 | "t0 = time.time() ",
119 | "mat = RowMatrix(users_rdd.map(lambda x: make_sparse(x)).toDF())"
120 | ]
121 | },
122 | {
123 | cell_type: "code",
124 | execution_count: null,
125 | metadata: {
126 | collapsed: true
127 | },
128 | outputs: [ ],
129 | source: [
130 | "a = mat.columnSimilarities(0)"
131 | ]
132 | }
133 | ],
134 | metadata: {
135 | kernelspec: {
136 | display_name: "PySpark",
137 | language: "python",
138 | name: "pyspark"
139 | },
140 | language_info: {
141 | codemirror_mode: {
142 | name: "ipython",
143 | version: 3
144 | },
145 | file_extension: ".py",
146 | mimetype: "text/x-python",
147 | name: "python",
148 | nbconvert_exporter: "python",
149 | pygments_lexer: "ipython3",
150 | version: "3.5.2"
151 | }
152 | },
153 | nbformat: 4,
154 | nbformat_minor: 2
155 | }
156 |
--------------------------------------------------------------------------------
/tests/unit/data/test_exporter.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | import unittest
24 | import sys
25 | import os
26 | import mock
27 |
28 | sys.path.append('./data')
29 |
30 | class Test_Exporter(unittest.TestCase):
31 | @staticmethod
32 | def _get_target_class():
33 | from exporter import Exporter
34 |
35 |
36 | return Exporter
37 |
38 | @mock.patch('exporter.uuid')
39 | def test_run_bq_query(self, uuid_mock):
40 | class JobSpec(object):
41 | def __init__(self, destination):
42 | self._destination = destination
43 | self._errors = None
44 | self._maximum_bytes_billed = None
45 |
46 | @property
47 | def destination(self):
48 |                 return self._destination
49 |
50 | @destination.setter
51 | def destination(self, value):
52 | self._destination = value
53 |
54 | def run(self):
55 | pass
56 |
57 | @property
58 | def errors(self):
59 | return self._errors
60 |
61 | @errors.setter
62 | def errors(self, value):
63 | self._errors = value
64 |
65 | @property
66 | def maximum_bytes_billed(self):
67 | return self._maximum_bytes_billed
68 |
69 | @maximum_bytes_billed.setter
70 | def maximum_bytes_billed(self, value):
71 | self._maximum_bytes_billed = value
72 |
73 | def begin(self):
74 | pass
75 |
76 | def result(self):
77 | pass
78 |
79 | uuid_mock.uuid4.return_value = 'test_id'
80 | klass = self._get_target_class()()
81 | job_mock = mock.Mock(spec=JobSpec)
82 | job_mock.errors = None
83 |
84 | client_mock = mock.Mock()
85 | client_mock.run_async_query.return_value = job_mock
86 |
87 | klass.run_bq_query(client_mock,
88 | 'query_test',
89 | {'threshold': 2,
90 | 'destination': 'test',
91 | 'maximum_bytes_billed': 100})
92 |
93 | self.assertEqual(job_mock.destination, 'test')
94 | self.assertEqual(job_mock.maximum_bytes_billed, 100)
95 | client_mock.run_async_query.assert_called_once_with(*['test_id', 'query_test'])
96 |
97 | with self.assertRaises(Exception):
98 | job_mock.errors = 'error'
99 | klass.run_bq_query(client_mock, 'test', {})
100 |
--------------------------------------------------------------------------------
/tests/system/data/neighbor/train/1/train.json:
--------------------------------------------------------------------------------
1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
3 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
4 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
5 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
6 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
7 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
8 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
9 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
10 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
11 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
12 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
13 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
14 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
15 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
16 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
17 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
18 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
19 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
20 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "2"}, {"group_id": "3"}]}}}
21 |
--------------------------------------------------------------------------------
/tests/unit/spark_jobs/test_top_seller.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 |
24 | import unittest
25 | import sys
26 | import mock
27 | import json
28 | import datetime
29 | import math
30 | from collections import namedtuple
31 |
32 | from pyspark.sql import types as stypes
33 | sys.path.append('./spark_jobs')
34 |
35 |
36 | class Test_TopSeller(unittest.TestCase):
37 | @staticmethod
38 | def _get_target_class():
39 | from top_seller import MarrecoTopSellerJob
40 |
41 |
42 | return MarrecoTopSellerJob
43 |
44 |
45 | def test_load_top_seller_schema(self):
46 | klass = self._get_target_class()()
47 | expected = stypes.StructType(fields=[
48 | stypes.StructField("item_key", stypes.StringType()),
49 | stypes.StructField("value", stypes.IntegerType())])
50 |
51 | result = klass._load_top_seller_schema()
52 |
53 | self.assertEqual(expected, result)
54 |
55 |
56 | def test_render_inter_uri(self):
57 | klass = self._get_target_class()()
58 | expected = 'folder/part-*'
59 | result = klass._render_inter_uri('folder')
60 | self.assertEqual(expected, result)
61 |
62 |
63 | def test_process_json_product_view(self):
64 | klass = self._get_target_class()()
65 | data = open('tests/data/productview_mock.json').read()
66 |
67 | result = list(klass._process_json(data))
68 | self.assertEqual(result, [])
69 |
70 |
71 | def test_process_json_search(self):
72 | klass = self._get_target_class()()
73 |
74 | data = open('tests/data/search_mock.json').read()
75 | result = list(klass._process_json(data))
76 | self.assertEqual(result, [])
77 |
78 |
79 | def test_process_json_orderconfirmation(self):
80 | klass = self._get_target_class()()
81 |
82 | data = open('tests/data/orderconfirmation_mock.json').read()
83 | result = list(klass._process_json(data))
84 | expected = [('DA923SHF35RHK', 1), ('VI618SHF69UQC', 1)]
85 |
86 | self.assertEqual(expected, result)
87 |
88 |
89 | def test_process_sysargs(self):
90 | input = ['--days_init=2',
91 | '--days_end=3',
92 | '--source_uri=source_uri',
93 | '--inter_uri=inter_uri',
94 | '--top_seller_uri=top_seller_uri',
95 | '--force=no']
96 |
97 | klass = self._get_target_class()()
98 | args = klass.process_sysargs(input)
99 | self.assertEqual(args.days_init, 2)
100 | self.assertEqual(args.days_end, 3)
101 | self.assertEqual(args.source_uri, 'source_uri')
102 | self.assertEqual(args.inter_uri, 'inter_uri')
103 | self.assertEqual(args.top_seller_uri, 'top_seller_uri')
104 | self.assertEqual(args.force, 'no')
105 |
106 |
--------------------------------------------------------------------------------
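Based on the expectations in the tests above (purchases yield one `(sku, count)` pair per product, while product views and searches yield nothing), here is a minimal sketch of that parsing step. It assumes the datajet event layout seen in the test fixtures and is not the code from `spark_jobs/top_seller.py`.

```python
import json

def process_json_sketch(line):
    """Illustrative only: yield (sku, quantity) pairs for orderconfirmation events."""
    event = json.loads(line)['event']
    if event.get('type') != 'orderconfirmation':
        return  # productview and search events do not count towards top seller
    details = event['details']
    products = details.get('products', [])
    quantities = details.get('quantities', [1] * len(products))
    for product, quantity in zip(products, quantities):
        yield (product['group_id'], quantity)
```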
/tests/system/data/top_seller/train/1/train.json:
--------------------------------------------------------------------------------
1 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
2 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
3 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
4 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
5 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
6 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
7 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
8 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
9 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
10 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
11 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
12 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
13 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
14 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
15 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
16 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "2"}}}}
17 | {"event": {"identifiers": {"djUCID": {"value": "2", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "3"}}}}
18 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "0"}}}}
19 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "productview", "details": {"product": {"group_id": "1"}}}}
20 | {"event": {"identifiers": {"djUCID": {"value": "3", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "2"}, {"group_id": "3"}], "quantities": [1, 1]}}}
21 | {"event": {"identifiers": {"djUCID": {"value": "0", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}], "quantities": [1]}}}
22 | {"event": {"identifiers": {"djUCID": {"value": "1", "type": "djUCID"}}, "source": {"tracker": "fish"}, "local_timestamp": 1, "type": "orderconfirmation", "details": {"products": [{"group_id": "0"}], "quantities": [1]}}}
23 |
--------------------------------------------------------------------------------
/data/exporter.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | """
24 | Exports data from BigQuery to GCS for future spark jobs.
25 | """
26 |
27 | import os
28 | import uuid
29 |
30 | from jinja2 import Environment, FileSystemLoader
31 | from google.cloud.bigquery import Client
32 |
33 |
34 | class Exporter(object):
35 | def bq_to_gcs(self,
36 | client,
37 | query,
38 | bq_config,
39 | gcs_config):
40 | """Runs ``query`` against BigQuery and exports the results to GCS.
41 |
42 |         :type bq_config: dict
43 |         :param bq_config: parameters to set on the BigQuery query job,
44 |             such as destination table and maximum bytes billed.
45 | 
46 |         :type gcs_config: dict
47 |         :param gcs_config: parameters to set on the extract job (destination URI, compression, format).
48 | """
49 | self.run_bq_query(client, query, bq_config)
50 | self.export_to_gcs(client, gcs_config)
51 |
52 |
53 | def run_bq_query(self, client, query, config):
54 | """Runs ``query`` against BQ
55 |
56 |         :type client: `google.cloud.bigquery.Client`
57 |         :param client: BigQuery client for job operations.
58 |
59 | :type config: dict
60 | :param config: general information for job execution.
61 |
62 |         :raises Exception: if ``job.errors`` is not None.
63 | """
64 | job = client.run_async_query(str(uuid.uuid4()), query)
65 | job = self._update_job_attrs(job, config)
66 | job.begin()
67 | job.result()
68 | if job.errors:
69 | raise Exception(str(job.errors))
70 |
71 |
72 | def export_to_gcs(self, client, config):
73 | """Runs job to export table from BigQuery to GCS.
74 |
75 | :type client: `google.cloud.bigquery.Client`
76 | :param client: bigquery client to run the job.
77 |
78 | :type config: dict
79 | :param config: key values to setup the job execution.
80 |
81 |         :raises Exception: if ``job.errors`` is not None.
82 | """
83 | job = client.extract_table_to_storage(str(uuid.uuid4()),
84 | config['table'],
85 | config['uri'])
86 |
87 | job = self._update_job_attrs(job, config)
88 | job.begin()
89 | result = job.result()
90 | if result.errors:
91 | raise Exception(str(result.errors))
92 |
93 |
94 | def _update_job_attrs(self, job, config):
95 | """Updates job attributes before running ``begin`` or ``run``.
96 |
97 | :type job: `google.cloud.bigquery.job.Job`
98 | :param job: job to be executed.
99 |
100 | :type config: dict
101 | :param config: values with attributes to update how ``job`` should be
102 | executed.
103 |
104 |         :rtype: `google.cloud.bigquery.job.Job`
105 | :returns: job with updated attributes.
106 | """
107 | for key, value in config.items():
108 | if key in set(dir(job)):
109 | job.__setattr__(key, value)
110 | return job
111 |
--------------------------------------------------------------------------------
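For context on how the two configuration dictionaries are consumed (each key simply has to match an attribute of the corresponding BigQuery job object, see `_update_job_attrs`), here is a hedged usage sketch assuming the same (older) google-cloud-bigquery client API used in this module. The dataset, table, query and URI values are placeholders.

```python
from google.cloud.bigquery import Client
from exporter import Exporter

client = Client()
dataset = client.dataset('my_dataset')   # placeholder dataset name
table = dataset.table('my_table')        # placeholder table name

Exporter().bq_to_gcs(
    client,
    'SELECT sku FROM some_table',                   # placeholder query
    {'destination': table,                          # attributes set on the query job
     'write_disposition': 'WRITE_TRUNCATE'},
    {'table': table,                                # table to extract
     'uri': 'gs://my-bucket/export/part-*.gz',      # where the extract lands in GCS
     'compression': 'GZIP',
     'destination_format': 'NEWLINE_DELIMITED_JSON'})
```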
/bin/export_datajet.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 |
24 | """
25 | Export data from BigQuery to GCS for PySpark Neighborhood
26 | """
27 |
28 | import argparse
29 | import sys
30 |
31 | sys.path.append('..')
32 |
33 | from data.exporter import Exporter
34 | from data.help import Jinjafy
35 | from google.cloud.bigquery import Client
36 |
37 |
38 | def get_sysargs(args):
39 | parser = argparse.ArgumentParser()
40 |
41 | parser.add_argument('--days_init',
42 | dest='days_init',
43 | type=int,
44 |                         help=("How many days back from today's date to "
45 |                               "start exporting data."))
46 |
47 | parser.add_argument('--days_end',
48 | dest='days_end',
49 | type=int,
50 |                         help=("How many days back from today's date to "
51 |                               "stop exporting data (inclusive)."))
52 |
53 | parser.add_argument('--table',
54 | dest='table',
55 | type=str,
56 | help=("Table name for where to save results in BQ."))
57 |
58 | parser.add_argument('--dataset',
59 | dest='dataset',
60 | type=str,
61 | help=('Name of dataset to export BQ tables to.'))
62 |
63 | parser.add_argument('--uri',
64 | dest='uri',
65 | type=str,
66 | help=('URI name to save the contents in GCS'))
67 |
68 | args = parser.parse_args(args)
69 | return args
70 |
71 | def build_query(jinjafy, query, input):
72 |     """Renders one of the datajet query templates.
73 | 
74 |     :type jinjafy: `data.help.Jinjafy`
75 |     :param jinjafy: handler for jinja operations.
76 | 
77 |     :type input: dict
78 |     :param input: values to be used in the jinja rendering of the ``query`` template.
79 | 
80 |     :rtype: str
81 |     :returns: query after jinja rendering runs.
82 | """
83 | return jinjafy.render_template(query, **input)
84 |
85 | def main():
86 | args = get_sysargs(sys.argv[1:])
87 | exporter = Exporter()
88 | jinjafy = Jinjafy('../data/queries/marreco/datajet/')
89 |
90 | client = Client()
91 | dataset = client.dataset(args.dataset)
92 | table = dataset.table(args.table)
93 |
94 | for day in range(args.days_init, args.days_end - 1, -1):
95 | print('processing day: ', day)
96 | for idx, file_ in enumerate(['productview.sql',
97 | 'search.sql',
98 | 'purchase.sql']):
99 |
100 | query = build_query(jinjafy,
101 | file_,
102 | {'dataset': '40663402',
103 | 'days_interval': day,
104 | 'days_interval_end': day})
105 |
106 | exporter.bq_to_gcs(client,
107 | query,
108 | {'destination': table,
109 | 'maximum_bytes_billed': 1000000000000,
110 | 'write_disposition': 'WRITE_TRUNCATE'},
111 | {'uri': args.uri.format(day=day, idx=idx),
112 | 'table': table,
113 | 'compression': 'GZIP',
114 | 'destination_format': 'NEWLINE_DELIMITED_JSON'})
115 |
116 |
117 | if __name__ == '__main__':
118 | sys.exit(main())
119 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PySpark Marreco
2 | A PySpark implementation of the [DIMSUM](http://arxiv.org/abs/1304.1467) algorithm.
3 |
4 | ## Getting Started
5 | This repository implements the DIMSUM algorithm on data describing customer interactions with products of a given web commerce site.
6 |
7 | The folder `data` handles the data used as input for the algorithm. It transforms Google BigQuery GA data into a pre-defined schema and saves the results to a user-specified URI, as discussed further below.
8 |
9 | The main folder of this repository is `spark_jobs`, where you'll find the main algorithm implemented, specifically in the file `spark_jobs/neighbor.py`.
10 |
11 | To run a neighbor job against spark using [Google Dataproc](https://cloud.google.com/dataproc/), this is one example of how to do so:
12 |
13 | ```sh
14 | gcloud dataproc jobs submit pyspark \
15 | --cluster=test3 \
16 | --properties=spark.hadoop.fs.s3n.awsAccessKeyId=,spark.hadoop.fs.s3n.awsSecretAccessKey= \
17 | --py-files=base.py,factory.py,neighbor.py \
18 | --bucket=lbanor \
19 | run_marreco.py -- \
20 | --days_init=7 \
21 | --days_end=3 \
22 | --source_uri=gs://lbanor/pyspark/datajet/dt={}/*.gz \
23 | --inter_uri=gs://lbanor/pyspark/marreco/neighbor/intermediate/{} \
24 | --threshold=0.1 \
25 | --force=no \
26 | --decay=0.03 \
27 | --w_browse=0.5 \
28 | --w_purchase=6.0 \
29 | --neighbor_uri=s3n://gfg-reco/similarities_matrix/ \
30 | --algorithm=neighbor
31 | ```
32 |
33 | In this example, notice that `source_uri` is a template for where to read datajet data from. The `{}` is
34 | later filled in by Python string formatting with the date being processed.
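As a tiny illustration, using the example path from above:

```python
source_uri = 'gs://lbanor/pyspark/datajet/dt={}/*.gz'
source_uri.format('2017-08-13')
# -> 'gs://lbanor/pyspark/datajet/dt=2017-08-13/*.gz'
```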
35 |
36 | Next we have `inter_uri`, which is where intermediary results are saved. By intermediary results we mean
37 | the output of the pre-processing that each algorithm applies to the datajet data to set up its input schema for
38 | later usage.
39 |
40 | Finally we have the `neighbor_uri` and that's where we save the final results. The example shown above contains values
41 | that we used in our own production environment. Please change them according to your own infrastructure.
42 |
43 | For the `top_seller` algorithm, here follows an example:
44 |
45 | ```sh
46 | gcloud dataproc jobs submit pyspark --cluster=test3 \
47 | --properties=spark.hadoop.fs.s3n.awsAccessKeyId=,spark.hadoop.fs.s3n.awsSecretAccessKey= \
48 | --py-files=base.py,factory.py,top_seller.py \
49 | --bucket=lbanor \
50 | run_marreco.py -- \
51 | --days_init=7 \
52 | --days_end=3 \
53 | --source_uri=gs://lbanor/pyspark/datajet/dt={}/*.gz \
54 | --inter_uri=gs://lbanor/pyspark/marreco/top_seller/intermediate/{} \
55 | --force=no \
56 | --top_seller_uri=s3n://gfg-reco/top_seller_array/ \
57 | --algorithm=top_seller
58 | ```
59 |
60 | To access the *help* menu, you can run:
61 |
62 | ```sh
63 | python run_marreco.py -h
64 | ```
65 |
66 | And for information about each algorithm, you can run (replace "neighbor" with any other available *algorithm* you desire):
67 |
68 | ```sh
69 | python run_marreco.py --algorithm=neighbor -h
70 | ```
71 |
72 | Examples of running each algorithm can be found in the folder `bin` such as the file `bin/dataproc_neighbor.sh`.
73 |
74 | ### Neighbor Algorithm
75 |
76 | For the neighborhood algorithm, you can send the parameter `threshold`, which sets the similarity value above which results are guaranteed to converge to their real values with a given probability and relative error. For instance, with `threshold=0.1`, every similarity above 0.1 is estimated within the guaranteed relative error, while lower similarities may be less precise. The trade-off is that higher thresholds require fewer computing resources to run the job.
77 |
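To make the role of `threshold` more concrete, below is a rough sketch of the sampling rule from the DIMSUM paper: columns with large norms are sampled with probability below one, which is where the resource savings come from. Variable names are illustrative and this is not the exact code in `spark_jobs/neighbor.py`.

```python
import math
import random

def dimsum_emit(row, norms, gamma):
    """row: [(sku, score), ...] for one user; norms: sku -> column norm;
    gamma: oversampling factor that grows as the threshold shrinks."""
    for i, (sku_i, score_i) in enumerate(row):
        p_i = min(1.0, math.sqrt(gamma) / norms[sku_i])
        for sku_j, score_j in row[i + 1:]:
            p_j = min(1.0, math.sqrt(gamma) / norms[sku_j])
            # Keep a pair only if both columns survive their own sampling step.
            if random.random() < p_i and random.random() < p_j:
                denom = (min(math.sqrt(gamma), norms[sku_i]) *
                         min(math.sqrt(gamma), norms[sku_j]))
                yield ((sku_i, sku_j), score_i * score_j / denom)
```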
78 | ## Pre-Requisites
79 |
80 | Main dependencies are:
81 | * *pyspark* with spark installed and ready to receive jobs.
82 | * Jinja2
83 | * Numpy (for unit tests)
84 | * *pytest*, *pytest-cov* and *mock*
85 |
86 | ## Running Unit Tests
87 |
88 | There are two types of tests in this project, *unit* and *system*. To run the latter, it's required to have a local spark cluster running in order to receive the jobs.
89 |
90 | To run the *unit tests*, go to the main folder and run:
91 |
92 | ```sh
93 | py.test tests/unit/ --quiet --cov=.
94 | ```
95 |
96 | For *integration testing*, each test must be run separately so as to avoid Spark context conflicts:
97 |
98 | ```sh
99 | py.test tests/system/spark_jobs/test_neighbor.py --quiet --cov=. --cov-fail-under=100
100 | ```
101 |
102 | Or for top seller:
103 |
104 | ```sh
105 | py.test tests/system/spark_jobs/test_top_seller.py --quiet --cov=. --cov-fail-under=100
106 | ```
107 |
108 | Notice the integration tests will take much longer, as they initialize a Spark context for the tests.
109 |
--------------------------------------------------------------------------------
/tests/unit/spark_jobs/test_neighbor.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | import unittest
24 | import sys
25 | import mock
26 | import json
27 | import datetime
28 | import math
29 | from collections import namedtuple
30 |
31 | from pyspark.sql import types as stypes
32 | sys.path.append('./spark_jobs')
33 |
34 |
35 | class Test_neighbor(unittest.TestCase):
36 | @staticmethod
37 | def _get_target_class():
38 | from neighbor import MarrecoNeighborJob
39 |
40 |
41 | return MarrecoNeighborJob
42 |
43 |
44 | def test_users_matrix_schema(self):
45 | klass = self._get_target_class()()
46 | expected = stypes.StructType(fields=[
47 | stypes.StructField("user_id", stypes.StringType()),
48 | stypes.StructField('interacted_items', stypes.ArrayType(
49 | stypes.StructType(fields=[stypes.StructField('key',
50 | stypes.StringType()), stypes.StructField('score',
51 | stypes.FloatType())])))])
52 |
53 | self.assertEqual(expected, klass._load_users_matrix_schema())
54 |
55 |
56 | def test_neighbor_schema(self):
57 | klass = self._get_target_class()()
58 | expected = stypes.StructType(fields=[
59 | stypes.StructField("item_key", stypes.StringType()),
60 | stypes.StructField("similarity_items", stypes.ArrayType(
61 | stypes.StructType(fields=[
62 | stypes.StructField("key", stypes.StringType()),
63 | stypes.StructField("score", stypes.FloatType())])))])
64 |
65 | self.assertEqual(expected, klass._load_neighbor_schema())
66 |
67 |
68 | @mock.patch('neighbor.random')
69 | def test_run_dimsum(self, random_mock):
70 | klass = self._get_target_class()()
71 |
72 | random_mock.random.return_value = 0.5
73 | class BroadDict(object):
74 | def __init__(self, dict_):
75 | self.value = dict_
76 |
77 | pq_b = BroadDict({'0': [0.6, 2.],
78 | '1': [0.6, 2.],
79 | '2': [0.3, 2.],
80 | '3': [0.6, 4.]})
81 |
82 | row = [('0', 2.), ('1', 4.), ('2', 6.), ('3', 8)]
83 | expected = [(('0', '1'), 2), (('0', '3'), 2.), (('1', '3'), 4.)]
84 |
85 | result = list(klass._run_DIMSUM(row, pq_b))
86 | self.assertEqual(expected, result)
87 |
88 |
89 | def test_process_scores(self):
90 | klass = self._get_target_class()()
91 | row = ['0', [('0', 1.), ('1', 2.), ('2', 3.)]]
92 | expected = [('0', 1.), ('1', 4.), ('2', 9)]
93 |
94 | result = list(klass._process_scores(row))
95 | self.assertEqual(expected, result)
96 |
97 |
98 | def test_render_inter_uri(self):
99 | klass = self._get_target_class()()
100 |
101 | expected = 'test_uri/part-*'
102 | result = klass._render_inter_uri('test_uri')
103 |
104 | self.assertEqual(expected, result)
105 |
106 | @mock.patch('neighbor.datetime')
107 | def test_process_json_product_view(self, datetime_mock):
108 | datetime_mock.datetime.now.return_value = datetime.datetime.utcfromtimestamp(
109 | 1502685428091 / 1000)
110 | datetime_mock.datetime.utcfromtimestamp.return_value = \
111 | datetime.datetime(*[2017, 8, 13])
112 |
113 | data = open('tests/data/productview_mock.json').read()
114 |
115 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay'])
116 | args = Args(0.5, 2., 1.5)
117 |
118 | klass = self._get_target_class()()
119 | result = list(klass._process_json(data, args))
120 | expected = [['25e35a54c8cace51', ('MA042APM76IPJ', math.exp(-1.5 * 1) * args.w_browse)]]
121 | self.assertEqual(expected, result)
122 |
123 |
124 | @mock.patch('neighbor.datetime')
125 | def test_process_json_orderconfirmation(self, datetime_mock):
126 | datetime_mock.datetime.now.return_value = datetime.datetime.utcfromtimestamp(
127 | 1502685428091 / 1000)
128 | datetime_mock.datetime.utcfromtimestamp.return_value = \
129 | datetime.datetime(*[2017, 8, 13])
130 |
131 | data = open('tests/data/orderconfirmation_mock.json').read()
132 |
133 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay'])
134 | args = Args(0.5, 2., 1.5)
135 |
136 | klass = self._get_target_class()()
137 | result = list(klass._process_json(data, args))
138 | expected = [['610574c802ba3b33',
139 | ('DA923SHF35RHK', math.exp(-1.5 * 1) * args.w_purchase)],
140 | ['610574c802ba3b33',
141 | ('VI618SHF69UQC', math.exp(-1.5 * 1) * args.w_purchase)]]
142 | self.assertEqual(expected, result)
143 |
144 |
145 | def test_process_json_search(self):
146 | data = open('tests/data/search_mock.json').read()
147 |
148 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay'])
149 | args = Args(0.5, 2., 1.5)
150 |
151 | klass = self._get_target_class()()
152 | result = list(klass._process_json(data, args))
153 | expected = []
154 | self.assertEqual(expected, result)
155 |
156 |
157 | def test_aggregate_skus(self):
158 | row = ['0', [('1', 0.5), ('2', 1.), ('1', 1.)]]
159 | expected = [('0', [('1', 1.5), ('2', 1.)])]
160 |
161 | klass = self._get_target_class()()
162 | result = list(klass._aggregate_skus(row))
163 | self.assertEqual(expected, result)
164 |
165 |
166 | def test_process_sysargs(self):
167 | args = ['--days_init=3',
168 | '--days_end=2',
169 | '--source_uri=source_uri',
170 | '--inter_uri=inter_uri',
171 | '--threshold=0.5',
172 | '--force=yes',
173 | '--users_matrix_uri=users_uri',
174 | '--neighbor_uri=neighbor_uri',
175 | '--w_browse=0.6',
176 | '--w_purchase=1.5']
177 |
178 | klass = self._get_target_class()()
179 | args = klass.process_sysargs(args)
180 | self.assertEqual(args.days_init, 3)
181 | self.assertEqual(args.days_end, 2)
182 | self.assertEqual(args.source_uri, 'source_uri')
183 | self.assertEqual(args.inter_uri, 'inter_uri')
184 | self.assertEqual(args.threshold, 0.5)
185 | self.assertEqual(args.force, 'yes')
186 | self.assertEqual(args.users_matrix_uri, 'users_uri')
187 | self.assertEqual(args.neighbor_uri, 'neighbor_uri')
188 | self.assertEqual(args.w_browse, 0.6)
189 | self.assertEqual(args.w_purchase, 1.5)
190 |
--------------------------------------------------------------------------------
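As the `test_aggregate_skus` expectation above suggests, aggregation sums the scores of duplicate skus within a single user's interaction list. A minimal sketch of that step (not the repository code) could look like this:

```python
from collections import defaultdict

def aggregate_skus_sketch(row):
    """row: (user_id, [(sku, score), ...]) -> yields (user_id, [(sku, summed_score), ...])."""
    user_id, interactions = row
    totals = defaultdict(float)
    for sku, score in interactions:
        totals[sku] += score
    yield (user_id, list(totals.items()))
```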
/notebooks/marreco_df.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | cells: [
3 | {
4 | cell_type: "code",
5 | execution_count: 57,
6 | metadata: { },
7 | outputs: [
8 | {
9 | data: {
10 | text/plain: [
11 | "['In', ",
12 | " 'Out', ",
13 | " 'SQLContext', ",
14 | " 'SparkConf', ",
15 | " 'SparkContext', ",
16 | " 'SparkSession', ",
17 | " '_', ",
18 | " '_19', ",
19 | " '_23', ",
20 | " '_28', ",
21 | " '_33', ",
22 | " '_36', ",
23 | " '_39', ",
24 | " '_4', ",
25 | " '_44', ",
26 | " '_47', ",
27 | " '_50', ",
28 | " '_56', ",
29 | " '__', ",
30 | " '___', ",
31 | " '__builtin__', ",
32 | " '__builtins__', ",
33 | " '__doc__', ",
34 | " '__loader__', ",
35 | " '__name__', ",
36 | " '__package__', ",
37 | " '__spec__', ",
38 | " '_dh', ",
39 | " '_i', ",
40 | " '_i1', ",
41 | " '_i10', ",
42 | " '_i11', ",
43 | " '_i12', ",
44 | " '_i13', ",
45 | " '_i14', ",
46 | " '_i15', ",
47 | " '_i16', ",
48 | " '_i17', ",
49 | " '_i18', ",
50 | " '_i19', ",
51 | " '_i2', ",
52 | " '_i20', ",
53 | " '_i21', ",
54 | " '_i22', ",
55 | " '_i23', ",
56 | " '_i24', ",
57 | " '_i25', ",
58 | " '_i26', ",
59 | " '_i27', ",
60 | " '_i28', ",
61 | " '_i29', ",
62 | " '_i3', ",
63 | " '_i30', ",
64 | " '_i31', ",
65 | " '_i32', ",
66 | " '_i33', ",
67 | " '_i34', ",
68 | " '_i35', ",
69 | " '_i36', ",
70 | " '_i37', ",
71 | " '_i38', ",
72 | " '_i39', ",
73 | " '_i4', ",
74 | " '_i40', ",
75 | " '_i41', ",
76 | " '_i42', ",
77 | " '_i43', ",
78 | " '_i44', ",
79 | " '_i45', ",
80 | " '_i46', ",
81 | " '_i47', ",
82 | " '_i48', ",
83 | " '_i49', ",
84 | " '_i5', ",
85 | " '_i50', ",
86 | " '_i51', ",
87 | " '_i52', ",
88 | " '_i53', ",
89 | " '_i54', ",
90 | " '_i55', ",
91 | " '_i56', ",
92 | " '_i57', ",
93 | " '_i6', ",
94 | " '_i7', ",
95 | " '_i8', ",
96 | " '_i9', ",
97 | " '_ih', ",
98 | " '_ii', ",
99 | " '_iii', ",
100 | " '_oh', ",
101 | " '_pythonstartup', ",
102 | " 'atexit', ",
103 | " 'build_correlations', ",
104 | " 'combine_skus', ",
105 | " 'conf', ",
106 | " 'defaultdict', ",
107 | " 'exit', ",
108 | " 'get_ipython', ",
109 | " 'math', ",
110 | " 'os', ",
111 | " 'platform', ",
112 | " 'py4j', ",
113 | " 'query', ",
114 | " 'quit', ",
115 | " 'r', ",
116 | " 'sc', ",
117 | " 'schema', ",
118 | " 'sfunc', ",
119 | " 'spark', ",
120 | " 'sql', ",
121 | " 'sqlContext', ",
122 | " 'sqlCtx', ",
123 | " 'stypes', ",
124 | " 'time', ",
125 | " 'train_df', ",
126 | " 'warnings']"
127 | ]
128 | },
129 | execution_count: 57,
130 | metadata: { },
131 | output_type: "execute_result"
132 | }
133 | ],
134 | source: [
135 | "dir()"
136 | ]
137 | },
138 | {
139 | cell_type: "code",
140 | execution_count: 92,
141 | metadata: {
142 | collapsed: true
143 | },
144 | outputs: [ ],
145 | source: [
146 | "import time ",
147 | "from collections import defaultdict ",
148 | "from pyspark.sql import functions as sfunc ",
149 | "from pyspark.sql import types as stypes ",
150 | "import math ",
151 | "import sys"
152 | ]
153 | },
154 | {
155 | cell_type: "code",
156 | execution_count: 2,
157 | metadata: {
158 | collapsed: true
159 | },
160 | outputs: [ ],
161 | source: [
162 | "schema = stypes.StructType().add("fv", stypes.StringType()).add("sku", stypes.StringType()).add("score", stypes.FloatType())"
163 | ]
164 | },
165 | {
166 | cell_type: "code",
167 | execution_count: 3,
168 | metadata: {
169 | collapsed: true
170 | },
171 | outputs: [ ],
172 | source: [
173 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema)"
174 | ]
175 | },
176 | {
177 | cell_type: "code",
178 | execution_count: 4,
179 | metadata: { },
180 | outputs: [
181 | {
182 | data: {
183 | text/plain: [
184 | "[Row(fv='3383270414872112082', sku='MO578SHF77RTI', score=0.5), ",
185 | " Row(fv='7143168022217708588', sku='DA923SHF54UJP', score=0.5), ",
186 | " Row(fv='8844960186636261737', sku='LU621ACM67NYU', score=0.5)]"
187 | ]
188 | },
189 | execution_count: 4,
190 | metadata: { },
191 | output_type: "execute_result"
192 | }
193 | ],
194 | source: [
195 | "tt = train_df.head(3)"
196 | ]
197 | },
198 | {
199 | cell_type: "code",
200 | execution_count: 96,
201 | metadata: {
202 | collapsed: true
203 | },
204 | outputs: [ ],
205 | source: [
206 | "tt = train_df.collect()"
207 | ]
208 | },
209 | {
210 | cell_type: "code",
211 | execution_count: 98,
212 | metadata: { },
213 | outputs: [
214 | {
215 | data: {
216 | text/plain: [
217 | "Row(fv='3383270414872112082', sku='MO578SHF77RTI', score=0.5)"
218 | ]
219 | },
220 | execution_count: 98,
221 | metadata: { },
222 | output_type: "execute_result"
223 | }
224 | ],
225 | source: [
226 | "tt[0]"
227 | ]
228 | },
229 | {
230 | cell_type: "code",
231 | execution_count: 97,
232 | metadata: { },
233 | outputs: [
234 | {
235 | data: {
236 | text/plain: [
237 | "42915448"
238 | ]
239 | },
240 | execution_count: 97,
241 | metadata: { },
242 | output_type: "execute_result"
243 | }
244 | ],
245 | source: [
246 | "sys.getsizeof(tt)"
247 | ]
248 | },
249 | {
250 | cell_type: "code",
251 | execution_count: 20,
252 | metadata: {
253 | collapsed: true
254 | },
255 | outputs: [ ],
256 | source: [
257 | "train_df.createOrReplaceTempView('test1')"
258 | ]
259 | },
260 | {
261 | cell_type: "code",
262 | execution_count: 10,
263 | metadata: {
264 | collapsed: true
265 | },
266 | outputs: [ ],
267 | source: [
268 | "def build_correlations(row): ",
269 | " return [{"sku": e.sku, "corr": [{"sku": i.sku, "score": e.score * i.score} for i in row]} for e in row] ",
270 | "sqlContext.udf.register("BUILD_CORRELATIONS", build_correlations, stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("corr", stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("score", stypes.FloatType(), False)])), False)])))"
271 | ]
272 | },
273 | {
274 | cell_type: "code",
275 | execution_count: 51,
276 | metadata: {
277 | collapsed: true
278 | },
279 | outputs: [ ],
280 | source: [
281 | "def combine_skus(ref_sku, row): ",
282 | " d = defaultdict(float) ",
283 | " ref_norm = 0.0 ",
284 | " for inner_row in row: ",
285 | " for e in inner_row: ",
286 | " d[e.sku] += e.score ",
287 | " if e.sku == ref_sku: ",
288 | " ref_norm += e.score ",
289 | " ref_norm = math.sqrt(ref_norm) ",
290 | " return {"norm": ref_norm, "corr": [{"sku": key, "similarity": value / ref_norm} for key, value in d.items()]} ",
291 | "sqlContext.udf.register("COMBINE_SKUS", combine_skus, stypes.StructType(fields=[stypes.StructField("norm", stypes.FloatType(), False), stypes.StructField("corr", stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("similarity", stypes.FloatType(), False)]) ) )]))"
292 | ]
293 | },
294 | {
295 | cell_type: "code",
296 | execution_count: 85,
297 | metadata: {
298 | collapsed: true
299 | },
300 | outputs: [ ],
301 | source: [
302 | "query = """ ",
303 | "SELECT ",
304 | " data.sku sku, ",
305 | " COMBINE_SKUS(data.sku, COLLECT_LIST(data.corr)) data ",
306 | "FROM( ",
307 | " SELECT ",
308 | " EXPLODE(BUILD_CORRELATIONS(data)) data ",
309 | " FROM( ",
310 | " SELECT ",
311 | " fv, ",
312 | " COLLECT_LIST(STRUCT(sku, score)) data ",
313 | " FROM test1 ",
314 | " GROUP BY ",
315 | " fv ",
316 | " HAVING SIZE(data) > 1 AND SIZE(data) < 200 ",
317 | " ) ",
318 | ") ",
319 | "GROUP BY ",
320 | " data.sku ",
321 | """""
322 | ]
323 | },
324 | {
325 | cell_type: "code",
326 | execution_count: 81,
327 | metadata: {
328 | collapsed: true
329 | },
330 | outputs: [ ],
331 | source: [
332 | "r1 = spark.sql(query)"
333 | ]
334 | },
335 | {
336 | cell_type: "code",
337 | execution_count: 82,
338 | metadata: {
339 | collapsed: true
340 | },
341 | outputs: [ ],
342 | source: [
343 | "r1.createOrReplaceTempView('test2')"
344 | ]
345 | },
346 | {
347 | cell_type: "code",
348 | execution_count: 69,
349 | metadata: {
350 | collapsed: true
351 | },
352 | outputs: [ ],
353 | source: [
354 | "query_extract_norms = """ ",
355 | "SELECT ",
356 | " sku, ",
357 | " data.norm norm ",
358 | "FROM test2 ",
359 | """""
360 | ]
361 | },
362 | {
363 | cell_type: "code",
364 | execution_count: 84,
365 | metadata: { },
366 | outputs: [
367 | {
368 | name: "stdout",
369 | output_type: "stream",
370 | text: [
371 | "1481.6083595752716 "
372 | ]
373 | }
374 | ],
375 | source: [
376 | "t0 = time.time() ",
377 | "r2 = {e.sku: e.norm for e in spark.sql(query_extract_norms).collect()} ",
378 | "print(time.time() - t0)"
379 | ]
380 | },
381 | {
382 | cell_type: "code",
383 | execution_count: 86,
384 | metadata: {
385 | collapsed: true
386 | },
387 | outputs: [ ],
388 | source: [
389 | "r2_broad = sc.broadcast(r2)"
390 | ]
391 | },
392 | {
393 | cell_type: "code",
394 | execution_count: 87,
395 | metadata: {
396 | collapsed: true
397 | },
398 | outputs: [ ],
399 | source: [
400 | "def normalize_corrs(corrs): ",
401 | " return [{"sku": e.sku, "similarity": e.similarity / r2_broad.value[e.sku]} for e in corrs] ",
402 | "sqlContext.udf.register("NORMALIZE_CORRS", normalize_corrs, stypes.ArrayType(stypes.StructType(fields=[stypes.StructField("sku", stypes.StringType(), False), stypes.StructField("similarity", stypes.FloatType(), False)])))"
403 | ]
404 | },
405 | {
406 | cell_type: "code",
407 | execution_count: 88,
408 | metadata: {
409 | collapsed: true
410 | },
411 | outputs: [ ],
412 | source: [
413 | "final_query = """ ",
414 | "select ",
415 | "sku, ",
416 | "NORMALIZE_CORRS(data.corr) corr ",
417 | "FROM test2 ",
418 | """""
419 | ]
420 | },
421 | {
422 | cell_type: "code",
423 | execution_count: 90,
424 | metadata: {
425 | collapsed: true
426 | },
427 | outputs: [ ],
428 | source: [
429 | "final = spark.sql(final_query)"
430 | ]
431 | },
432 | {
433 | cell_type: "code",
434 | execution_count: 91,
435 | metadata: { },
436 | outputs: [
437 | {
438 | name: "stdout",
439 | output_type: "stream",
440 | text: [
441 | "381.65184354782104 "
442 | ]
443 | }
444 | ],
445 | source: [
446 | "t0 = time.time() ",
447 | "final.head(1) ",
448 | "print(time.time() - t0)"
449 | ]
450 | },
451 | {
452 | cell_type: "code",
453 | execution_count: null,
454 | metadata: {
455 | collapsed: true
456 | },
457 | outputs: [ ],
458 | source: [ ]
459 | },
460 | {
461 | cell_type: "code",
462 | execution_count: null,
463 | metadata: {
464 | collapsed: true
465 | },
466 | outputs: [ ],
467 | source: [ ]
468 | }
469 | ],
470 | metadata: {
471 | kernelspec: {
472 | display_name: "PySpark",
473 | language: "python",
474 | name: "pyspark"
475 | },
476 | language_info: {
477 | codemirror_mode: {
478 | name: "ipython",
479 | version: 3
480 | },
481 | file_extension: ".py",
482 | mimetype: "text/x-python",
483 | name: "python",
484 | nbconvert_exporter: "python",
485 | pygments_lexer: "ipython3",
486 | version: "3.5.2"
487 | }
488 | },
489 | nbformat: 4,
490 | nbformat_minor: 2
491 | }
492 |
--------------------------------------------------------------------------------
/spark_jobs/top_seller.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | """
24 | Set of tools to run Marreco's Top Seller algorithm in spark.
25 | """
26 |
27 | import os
28 | import sys
29 | import json
30 | import operator
31 | import math
32 | import random
33 | import argparse
34 | from collections import defaultdict
35 |
36 | sys.path.append('..')
37 |
38 | from base import MarrecoBase
39 | from py4j.protocol import Py4JJavaError
40 | from pyspark.sql.utils import AnalysisException
41 | from pyspark.sql import SparkSession
42 | from pyspark.sql import types as stypes
43 |
44 |
45 | class MarrecoTopSellerJob(MarrecoBase):
46 | """This Class has all methods necessary to build Marreco Neighborhood
47 |     """This class has all methods necessary to build Marreco's Top Seller
48 |     algorithm against Spark.
49 | 
50 |     :type context: `pyspark.SparkContext`
51 |     :param context: context against which the jobs are run.
52 | def transform_data(self, sc, args):
53 |         """This method gets datajet files as input and prepares them on a daily
54 |         intermediary basis for Marreco's Top Seller algorithm.
55 |
56 | :type sc: spark context
57 | :param sc: spark context for running jobs.
58 |
59 |         :type args: Namespace
60 |         :param args: namespace with the attributes below.
61 | 
62 |         :type days_init: int
63 |         :param days_init: how many days back from today to start scanning files.
64 | 
65 |         :type days_end: int
66 |         :param days_end: how many days back from today to stop scanning (inclusive).
67 |
68 | :type inter_uri: str
69 | :param inter_uri: uri for where to save intermediate results.
70 |
71 | :type force: str
72 |         :param force: either ``yes``, which forces recreation of the
73 |             files, or ``no``, in which case nothing is done if the files
74 |             already exist.
75 |
76 | :type source_uri: str
77 | :param source_uri: URI from where to read files.
78 | """
79 | spark = SparkSession(sc)
80 | for day in range(args.days_init, args.days_end - 1, -1):
81 | formatted_day = self.get_formatted_date(day)
82 |
83 | source_uri = args.source_uri.format(formatted_day)
84 | inter_uri = args.inter_uri.format(formatted_day)
85 | try:
86 | inter_data = spark.read.json(inter_uri,
87 | schema = self._load_top_seller_schema()).first()
88 |
89 | if args.force == 'yes' or not inter_data:
90 | self._process_datajet_day(sc,
91 | source_uri,
92 | inter_uri,
93 | 'overwrite')
94 | except (Py4JJavaError, AnalysisException):
95 | self._process_datajet_day(sc, source_uri, inter_uri)
96 | finally:
97 |                 print('processed data for day {}'.format(day))
98 |
99 |
100 | def _process_datajet_day(self, sc, uri, inter_uri, mode=None):
101 | """Gets datajet json like files and transforms them into data like
102 | [(sku, items_sold),...] saving it in the end.
103 |
104 | :type sc: spark context
105 | :param sc: context to run spark jobs.
106 |
107 | :type uri: str
108 | :param uri: where the files are located.
109 |
110 | :type inter_uri: str
111 | :param inter_uri: where intermediate results should be saved.
112 |
113 | :type mode: str
114 |         :param mode: indicates how data should be saved. If ``None``, an error
115 |                      is thrown if the file already exists. If ``overwrite``, the
116 |                      previous file is deleted and the new one is saved.
117 | """
118 | sc.textFile(uri) \
119 | .flatMap(lambda x: self._process_json(x)) \
120 | .filter(lambda x: x) \
121 | .reduceByKey(operator.add) \
122 | .toDF(schema=self._load_top_seller_schema()) \
123 | .write.json(inter_uri, compression='gzip', mode=mode)
124 |
125 |
126 | def _load_top_seller_schema(self):
127 | """Loads schema for top seller intermediate data saved like
128 | [sku, items_sold]
129 |
130 | :rtype: `pyspark.sql.StructType`
131 | :returns: schema for top selling data
132 | """
133 | return stypes.StructType(fields=[
134 | stypes.StructField("item_key", stypes.StringType()),
135 | stypes.StructField("value", stypes.IntegerType())])
136 |
137 |
138 | def build_marreco(self, sc, args):
139 | """Main method for building Marreco's algorithms and saving results
140 | for later usage.
141 |
142 | :type sc: `pyspark.SparkContext`
143 | :param sc: spark context for running jobs.
144 |
145 | :type args: Namespace
146 | :param args:
147 | :type days_init: int
148 |               :param days_init: first day (offset from today) from which to read
149 |                                 the intermediary daily results.
150 |
151 | :type days_end: int
152 |               :param days_end: last day (offset from today) up to which input data is read.
153 |
154 | :type inter_uri: str
155 | :param inter_uri: URI where intermediary results should be read from
156 |
157 | :type source_uri: str
158 | :param source_uri: source from where to read input data
159 |
160 | :type force: str
161 |               :param force: either ``yes``, in which case intermediate files are
162 |                             replaced, or ``no``, in which case nothing is done if the files already exist.
163 |
164 | :type top_seller_uri: str
165 | :param top_seller_uri: URI for where to save results
166 | """
167 | spark = SparkSession(sc)
168 | data = sc.emptyRDD()
169 |
170 | for day in range(args.days_init, args.days_end - 1, -1):
171 | formatted_day = self.get_formatted_date(day)
172 | inter_uri = self._render_inter_uri(args.inter_uri.format(
173 | formatted_day))
174 |
175 | data = data.union(spark.read.json(inter_uri,
176 | schema=self._load_top_seller_schema()).rdd)
177 |
178 | data = data.reduceByKey(operator.add) \
179 | .sortBy(lambda x: x[1], False)
180 | self._save_top_seller_matrix(args.top_seller_uri, data)
181 |
182 |
183 | def _save_top_seller_matrix(self, top_seller_uri, data):
184 | """Loads top seller schema and saves final results as
185 |         [(item_key, items_sold), (item_key, items_sold), ...]
186 |
187 | :type top_seller_uri: str
188 | :param top_seller_uri: uri for where to save the matrix.
189 |
190 | :type data: RDD
191 | :param data: RDD with data like [item_key, items_sold]
192 | """
193 | data.toDF(schema=self._load_top_seller_schema()) \
194 | .write.json(top_seller_uri, compression='gzip', mode='overwrite')
195 |
196 |
197 | def _render_inter_uri(self, inter_uri, name_pattern='part-*'):
198 | """Helper function to process inter_uri's for later usage.
199 |
200 | :type inter_uri: str
201 | :param inter_uri: URI used for saving intermediate data transformation
202 | results.
203 |
204 | :type name_pattern: str
205 | :param name_pattern: pattern used by spark to save multiple files.
206 |
207 | :rtype: str
208 |         :returns: rendered URI used to read the saved data back into the code.
209 | """
210 | return os.path.join(inter_uri, name_pattern)
211 |
212 |
213 | @staticmethod
214 | def _process_json(row):
215 | """Mapper function to extract from each line from datajet file
216 | and return interactions between customers and sold skus.
217 |
218 | :type row: str
219 | :param row: json string with datajet data.
220 |
221 |         :rtype: generator
222 |         :returns: yields (sku, items_sold) tuples.
223 | """
224 | try:
225 | r = json.loads(row)
226 | if (r['event']['source']['tracker'] == 'fish' and
227 | 'local_timestamp' in r['event'] and
228 | r['event']['identifiers']['djUCID']['value'] and
229 | r['event']['type'] == "orderconfirmation"):
230 |
231 | for e in list(zip([e['group_id'] for e in
232 | r['event']['details']['products']],
233 | ([int(e) for e in
234 | r['event']['details']['quantities']]))):
235 | yield e
236 |
237 |         except Exception:
238 | yield []
239 |
240 |
241 | @staticmethod
242 | def process_sysargs(args):
243 | parser = argparse.ArgumentParser()
244 |
245 | parser.add_argument('--days_init',
246 | dest='days_init',
247 | type=int,
248 | help=("Total amount of days to come back in time "
249 | "from today's date."))
250 |
251 | parser.add_argument('--days_end',
252 | dest='days_end',
253 | type=int,
254 | help=("Total amount of days to come back in time "
255 | "from today's date."))
256 |
257 | parser.add_argument('--source_uri',
258 | dest='source_uri',
259 | type=str,
260 | help=("URI template from where to read source "
261 | "files from."))
262 |
263 | parser.add_argument('--inter_uri',
264 | dest='inter_uri',
265 | type=str,
266 | help=('URI for saving intermediary results.'))
267 |
268 | parser.add_argument('--top_seller_uri',
269 | dest='top_seller_uri',
270 | type=str,
271 | help=('URI for saving top_seller results.'))
272 |
273 | parser.add_argument('--force',
274 | dest='force',
275 | type=str,
276 |                             help=('If ``yes`` then replace all files with new ones. '
277 |                                   'If ``no``, then no replacing happens.'))
278 |
279 | args = parser.parse_args(args)
280 | return args
281 |
--------------------------------------------------------------------------------
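A minimal plain-Python sketch of what the Top Seller job above computes, with no Spark involved: `_process_json` filters order-confirmation events and yields (sku, quantity) pairs, and `reduceByKey(operator.add)` sums them per sku. The sample record below only mirrors the datajet layout that `_process_json` assumes; its field values are invented for illustration.

import json
from collections import Counter

# Illustrative datajet-like record (field names follow what _process_json
# reads; the values are made up).
sample_rows = [
    json.dumps({"event": {
        "source": {"tracker": "fish"},
        "local_timestamp": 1502582400,
        "identifiers": {"djUCID": {"value": "abc"}},
        "type": "orderconfirmation",
        "details": {"products": [{"group_id": "SKU1"}, {"group_id": "SKU2"}],
                    "quantities": ["2", "1"]}}}),
]

def extract_pairs(row):
    # Same filtering and extraction performed by _process_json.
    e = json.loads(row)["event"]
    if (e["source"]["tracker"] == "fish" and "local_timestamp" in e
            and e["identifiers"]["djUCID"]["value"]
            and e["type"] == "orderconfirmation"):
        for sku, qty in zip((p["group_id"] for p in e["details"]["products"]),
                            (int(q) for q in e["details"]["quantities"])):
            yield sku, qty

totals = Counter()
for row in sample_rows:
    for sku, qty in extract_pairs(row):
        totals[sku] += qty  # plays the role of reduceByKey(operator.add)

print(totals.most_common())  # skus ordered by items sold, like build_marreco's sortBy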
/tests/system/spark_jobs/test_top_seller.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | """
24 | These are system tests and should only be run if the environment has pyspark
25 | and a spark cluster available to receive on-demand jobs.
26 | """
27 |
28 | import os
29 | import unittest
30 | import sys
31 | import mock
32 | import json
33 | import datetime
34 | import pyspark
35 | import math
36 | import glob
37 | import shutil
38 | from collections import namedtuple
39 | import numpy as np
40 |
41 | from pyspark.sql import types as stypes
42 | sys.path.append('./spark_jobs')
43 |
44 | py_files = ['./spark_jobs/top_seller.py',
45 | './spark_jobs/base.py',
46 | './spark_jobs/factory.py']
47 |
48 |
49 | class Test_top_seller(unittest.TestCase):
50 |
51 | _sc = pyspark.SparkContext(pyFiles=py_files)
52 | _session = pyspark.sql.SparkSession(_sc)
53 | _to_delete_uris = []
54 |
55 |
56 | @staticmethod
57 | def _get_target_class():
58 | from top_seller import MarrecoTopSellerJob
59 |
60 |
61 | return MarrecoTopSellerJob
62 |
63 |
64 | @staticmethod
65 | def _delete_dirs(*args):
66 | for arg in args:
67 | if os.path.isdir(arg):
68 | shutil.rmtree(arg)
69 |
70 |
71 | def _prepare_daily_data(self):
72 | for i in [1, 2]:
73 | uri = 'tests/system/data/top_seller/train/{}/train.json'.format(
74 | i)
75 | data = self._sc.textFile(uri)
76 | formatted_day = (datetime.datetime.now() -
77 | datetime.timedelta(days=i)).strftime('%Y-%m-%d')
78 |
79 | save_uri = 'tests/system/data/top_seller/train/{}/train.json'.format(
80 | formatted_day)
81 | self._delete_dirs(save_uri)
82 | self._to_delete_uris.append(os.path.dirname(save_uri))
83 | data.saveAsTextFile(save_uri)
84 |
85 |
86 | def _delete_uris(self):
87 | for uri in self._to_delete_uris:
88 | self._delete_dirs(uri)
89 | self._to_delete_uris = []
90 |
91 |
92 | def test_process_datajet_day_no_force(self):
93 | klass = self._get_target_class()()
94 | inter_uri = 'tests/system/data/top_seller/dj'
95 | self._delete_dirs(inter_uri)
96 | self.assertFalse(os.path.isdir(inter_uri))
97 |
98 | klass._process_datajet_day(self._sc,
99 | 'tests/system/data/top_seller/datajet_test.json',
100 | inter_uri,
101 | mode=None)
102 |
103 | result = self._session.read.json(inter_uri).toJSON().collect()
104 | expected = ['{"item_key":"DA923SHF35RHK","value":1}',
105 | '{"item_key":"VI618SHF69UQC","value":1}']
106 |
107 | self.assertEqual(result, expected)
108 | self._delete_dirs(inter_uri)
109 | self.assertFalse(os.path.isdir(inter_uri))
110 |
111 |
112 | def test_process_datajet_day_yes_force(self):
113 | klass = self._get_target_class()()
114 | inter_uri = 'tests/system/data/top_seller/dj'
115 | self._delete_dirs(inter_uri)
116 | self.assertFalse(os.path.isdir(inter_uri))
117 |
118 | klass._process_datajet_day(self._sc,
119 | 'tests/system/data/top_seller/datajet_test.json',
120 | inter_uri,
121 | mode=None)
122 |
123 | klass._process_datajet_day(self._sc,
124 | 'tests/system/data/top_seller/datajet_test.json',
125 | inter_uri,
126 | mode='overwrite')
127 |
128 | result = self._session.read.json(inter_uri).toJSON().collect()
129 | expected = ['{"item_key":"DA923SHF35RHK","value":1}',
130 | '{"item_key":"VI618SHF69UQC","value":1}']
131 |
132 | self.assertEqual(result, expected)
133 | self._delete_dirs(inter_uri)
134 | self.assertFalse(os.path.isdir(inter_uri))
135 |
136 |
137 | def test_transform_data_no_force(self):
138 | klass = self._get_target_class()()
139 | inter_uri = 'tests/system/data/top_seller/inter/{}'
140 | Args = namedtuple('args', ['days_init',
141 | 'days_end',
142 | 'force',
143 | 'source_uri',
144 | 'inter_uri'])
145 |
146 | self._prepare_daily_data()
147 |
148 | args = Args(2, 1, 'no',
149 | 'tests/system/data/top_seller/train/{}/train.json',
150 | inter_uri)
151 | klass.transform_data(self._sc, args)
152 |
153 | data1_uri = ['{"item_key":"2","value":1}',
154 | '{"item_key":"3","value":1}',
155 | '{"item_key":"0","value":2}']
156 |
157 | data2_uri = ['{"item_key":"0","value":1}',
158 | '{"item_key":"1","value":1}',
159 | '{"item_key":"2","value":2}']
160 |
161 | expected = {2: data2_uri,
162 | 1: data1_uri}
163 |
164 | for day in range(args.days_init, args.days_end - 1, -1):
165 | formatted_day = klass.get_formatted_date(day)
166 | result = self._session.read.json(inter_uri.format(formatted_day),
167 | schema=klass._load_top_seller_schema()).toJSON().collect()
168 | self.assertEqual(result, expected[day])
169 |
170 | for day in [2, 1]:
171 | formatted_day = klass.get_formatted_date(day)
172 | self._delete_dirs(inter_uri.format(formatted_day))
173 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
174 | self._delete_uris()
175 | self.assertEqual(self._to_delete_uris, [])
176 |
177 |
178 | def test_transform_data_yes_force(self):
179 | klass = self._get_target_class()()
180 | inter_uri = 'tests/system/data/top_seller/inter/{}'
181 | Args = namedtuple('args', ['days_init',
182 | 'days_end',
183 | 'force',
184 | 'source_uri',
185 | 'inter_uri'])
186 |
187 | self._prepare_daily_data()
188 |
189 | args = Args(2, 1, 'no',
190 | 'tests/system/data/top_seller/train/{}/train.json',
191 | inter_uri)
192 | klass.transform_data(self._sc, args)
193 |
194 | args = Args(2, 1, 'yes',
195 | 'tests/system/data/top_seller/train/{}/train.json',
196 | inter_uri)
197 | klass.transform_data(self._sc, args)
198 |
199 | data1_uri = ['{"item_key":"2","value":1}',
200 | '{"item_key":"3","value":1}',
201 | '{"item_key":"0","value":2}']
202 |
203 | data2_uri = ['{"item_key":"0","value":1}',
204 | '{"item_key":"1","value":1}',
205 | '{"item_key":"2","value":2}']
206 |
207 | expected = {2: data2_uri,
208 | 1: data1_uri}
209 |
210 | for day in range(args.days_init, args.days_end - 1, -1):
211 | formatted_day = klass.get_formatted_date(day)
212 | result = self._session.read.json(inter_uri.format(formatted_day),
213 | schema=klass._load_top_seller_schema()).toJSON().collect()
214 | self.assertEqual(result, expected[day])
215 |
216 | for day in [2, 1]:
217 | formatted_day = klass.get_formatted_date(day)
218 | self._delete_dirs(inter_uri.format(formatted_day))
219 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
220 | self._delete_uris()
221 | self.assertEqual(self._to_delete_uris, [])
222 |
223 |
224 | def test_build_marreco_yes_force(self):
225 | klass = self._get_target_class()()
226 | inter_uri = 'tests/system/data/top_seller/inter/{}'
227 | result_uri = 'tests/system/data/top_seller/result'
228 |
229 | self._prepare_daily_data()
230 |
231 | Args = namedtuple('args', ['days_init',
232 | 'days_end',
233 | 'force',
234 | 'source_uri',
235 | 'inter_uri',
236 | 'top_seller_uri'])
237 |
238 | args = Args(2, 1, 'no',
239 | 'tests/system/data/top_seller/train/{}/train.json',
240 | inter_uri,
241 | 'tests/system/data/top_seller/result')
242 | klass.transform_data(self._sc, args)
243 | klass.build_marreco(self._sc, args)
244 |
245 | args = Args(2, 1, 'yes',
246 | 'tests/system/data/top_seller/train/{}/train.json',
247 | inter_uri,
248 | 'tests/system/data/top_seller/result')
249 | klass.transform_data(self._sc, args)
250 | klass.build_marreco(self._sc, args)
251 |
252 | expected = ['{"item_key":"0","value":3}',
253 | '{"item_key":"1","value":1}',
254 | '{"item_key":"2","value":3}',
255 | '{"item_key":"3","value":1}']
256 |
257 | result = sorted(self._session.read.json(result_uri,
258 | schema=klass._load_top_seller_schema()).toJSON().collect())
259 |
260 | self.assertEqual(result, expected)
261 | self._delete_dirs(result_uri)
262 |
263 | for day in [2, 1]:
264 | formatted_day = klass.get_formatted_date(day)
265 | self._delete_dirs(inter_uri.format(formatted_day))
266 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
267 | self._delete_uris()
268 | self.assertEqual(self._to_delete_uris, [])
269 |
270 |
271 | def test_build_marreco_no_force(self):
272 | klass = self._get_target_class()()
273 | inter_uri = 'tests/system/data/top_seller/inter/{}'
274 | result_uri = 'tests/system/data/top_seller/result'
275 | Args = namedtuple('args', ['days_init',
276 | 'days_end',
277 | 'force',
278 | 'source_uri',
279 | 'inter_uri',
280 | 'top_seller_uri'])
281 |
282 | self._prepare_daily_data()
283 |
284 | args = Args(2, 1, 'no',
285 | 'tests/system/data/top_seller/train/{}/train.json',
286 | inter_uri,
287 | 'tests/system/data/top_seller/result')
288 | klass.transform_data(self._sc, args)
289 | klass.build_marreco(self._sc, args)
290 |
291 | expected = ['{"item_key":"0","value":3}',
292 | '{"item_key":"1","value":1}',
293 | '{"item_key":"2","value":3}',
294 | '{"item_key":"3","value":1}']
295 |
296 | result = sorted(self._session.read.json(result_uri,
297 | schema=klass._load_top_seller_schema())\
298 | .toJSON().collect())
299 |
300 | self.assertEqual(result, expected)
301 |
302 | self._delete_dirs(result_uri)
303 |
304 | for day in [2, 1]:
305 | formatted_day = klass.get_formatted_date(day)
306 | self._delete_dirs(inter_uri.format(formatted_day))
307 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
308 | self._delete_uris()
309 | self.assertEqual(self._to_delete_uris, [])
310 |
311 |
--------------------------------------------------------------------------------
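The tests above lean on `get_formatted_date` from base.py, which is not shown in this section. Assuming it mirrors the `strftime` call used in `_prepare_daily_data`, its expected behavior is roughly the sketch below.

import datetime

def get_formatted_date(day):
    # Assumed behavior: today minus ``day`` days, rendered as 'YYYY-MM-DD',
    # the same format _prepare_daily_data uses to name the train directories.
    return (datetime.datetime.now()
            - datetime.timedelta(days=day)).strftime('%Y-%m-%d')

print(get_formatted_date(1))  # e.g. '2017-08-14' when run on 2017-08-15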
/notebooks/marreco_dimsum_internal.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | cells: [
3 | {
4 | cell_type: "code",
5 | execution_count: 2,
6 | metadata: {
7 | collapsed: true
8 | },
9 | outputs: [ ],
10 | source: [
11 | "from pyspark.mllib.linalg import SparseVector ",
12 | "from pyspark.mllib.linalg.distributed import RowMatrix ",
13 | "import numpy as np ",
14 | "from sklearn.metrics.pairwise import cosine_similarity ",
15 | "import time ",
16 | "from collections import defaultdict ",
17 | "from pyspark.sql import functions as sfunc ",
18 | "from pyspark.sql import types as stypes ",
19 | "import math ",
20 | "import sys ",
21 | "from pyspark.ml.linalg import SparseVector ",
22 | "from pyspark.mllib.linalg.distributed import RowMatrix ",
23 | "from operator import itemgetter ",
24 | "import operator ",
25 | "import random"
26 | ]
27 | },
28 | {
29 | cell_type: "code",
30 | execution_count: 3,
31 | metadata: {
32 | collapsed: true
33 | },
34 | outputs: [ ],
35 | source: [
36 | "schema = stypes.StructType().add("fv", stypes.StringType()).add("sku", stypes.StringType()).add("score", stypes.FloatType()) ",
37 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema) ",
38 | "train_df.createOrReplaceTempView('test1')"
39 | ]
40 | },
41 | {
42 | cell_type: "code",
43 | execution_count: 188,
44 | metadata: { },
45 | outputs: [
46 | {
47 | name: "stdout",
48 | output_type: "stream",
49 | text: [
50 | "[Row(fv='1005105267406228429', sku='FI911SHF89UBM-50', score=5.0)] "
51 | ]
52 | }
53 | ],
54 | source: [
55 | "print(train_df.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))"
56 | ]
57 | },
58 | {
59 | cell_type: "code",
60 | execution_count: 82,
61 | metadata: {
62 | collapsed: true
63 | },
64 | outputs: [ ],
65 | source: [
66 | "# query = """ ",
67 | "# SELECT ",
68 | "# sku, ",
69 | "# ROW_NUMBER() OVER (ORDER BY SUM(1)) -1 idx ",
70 | "# FROM test1 ",
71 | "# GROUP BY 1 ",
72 | "# """ ",
73 | "# skus_rdd = spark.sql(query).rdd"
74 | ]
75 | },
76 | {
77 | cell_type: "code",
78 | execution_count: 4,
79 | metadata: {
80 | collapsed: true
81 | },
82 | outputs: [ ],
83 | source: [
84 | "query_statistics = """ ",
85 | "SELECT ",
86 | " sku, ",
87 | " SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) / SQRT(SUM(score * score)) p, ",
88 | " IF(SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) > SQRT(SUM(score * score)), SQRT(SUM(score * score)), SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold})) q --- implements the min(gamma, ||c||) ",
89 | "FROM test1 ",
90 | "GROUP BY 1 ",
91 | """""
92 | ]
93 | },
94 | {
95 | cell_type: "code",
96 | execution_count: 8,
97 | metadata: { },
98 | outputs: [ ],
99 | source: [
100 | "skus_stats = spark.sql(query_statistics.format(threshold=0.1))"
101 | ]
102 | },
103 | {
104 | cell_type: "code",
105 | execution_count: 9,
106 | metadata: { },
107 | outputs: [
108 | {
109 | name: "stdout",
110 | output_type: "stream",
111 | text: [
112 | "[Row(sku='FI911SHF89UBM-50', p=7.132311576894841, q=5.0)] "
113 | ]
114 | }
115 | ],
116 | source: [
117 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))"
118 | ]
119 | },
120 | {
121 | cell_type: "code",
122 | execution_count: 178,
123 | metadata: { },
124 | outputs: [
125 | {
126 | data: {
127 | text/plain: [
128 | "[Row(sku='PO140ACU06DDD', p=2.4697175158107982, q=14.439529078193651), ",
129 | " Row(sku='PO140ACU76FVN', p=35.661557884474206, q=1.0), ",
130 | " Row(sku='JU082SHF02WWZ', p=3.790780833876121, q=9.40744386111339)]"
131 | ]
132 | },
133 | execution_count: 178,
134 | metadata: { },
135 | output_type: "execute_result"
136 | }
137 | ],
138 | source: [
139 | "sku_stats.take(3)"
140 | ]
141 | },
142 | {
143 | cell_type: "code",
144 | execution_count: 194,
145 | metadata: { },
146 | outputs: [
147 | {
148 | name: "stdout",
149 | output_type: "stream",
150 | text: [
151 | "[] "
152 | ]
153 | }
154 | ],
155 | source: [
156 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))"
157 | ]
158 | },
159 | {
160 | cell_type: "code",
161 | execution_count: null,
162 | metadata: {
163 | collapsed: true
164 | },
165 | outputs: [ ],
166 | source: [
167 | "# query_statistics = """ ",
168 | "# SELECT ",
169 | "# sku, ",
170 | "# {gamma} / SQRT(SUM(score * score)) p, ",
171 | "# IF({gamma} > SQRT(SUM(score * score)), SQRT(SUM(score * score)), {gamma}) q ",
172 | "# FROM test1 ",
173 | "# GROUP BY 1 ",
174 | "# """"
175 | ]
176 | },
177 | {
178 | cell_type: "code",
179 | execution_count: 60,
180 | metadata: {
181 | collapsed: true
182 | },
183 | outputs: [ ],
184 | source: [
185 | "# def get_gamma(threshold, numCols): ",
186 | "# return math.sqrt(10 * math.log(numCols) / threshold) if threshold > 10e-6 else math.inf"
187 | ]
188 | },
189 | {
190 | cell_type: "code",
191 | execution_count: 76,
192 | metadata: { },
193 | outputs: [
194 | {
195 | name: "stdout",
196 | output_type: "stream",
197 | text: [
198 | "35.57234899487128 "
199 | ]
200 | }
201 | ],
202 | source: [
203 | "# gamma_b = sc.broadcast(get_gamma(10e-2)) ",
204 | "# print(gamma_b.value)"
205 | ]
206 | },
207 | {
208 | cell_type: "code",
209 | execution_count: 77,
210 | metadata: { },
211 | outputs: [ ],
212 | source: [
213 | "# skus_stats = spark.sql(query_statistics.format(gamma=gamma_b.value))"
214 | ]
215 | },
216 | {
217 | cell_type: "code",
218 | execution_count: 78,
219 | metadata: { },
220 | outputs: [
221 | {
222 | data: {
223 | text/plain: [
224 | "[Row(sku='NI531SRM74IHX', p=2.8758539658272255, q=12.36931687685298), ",
225 | " Row(sku='MO578SHF45QNE', p=0.5225157525775272, q=35.57234899487128)]"
226 | ]
227 | },
228 | execution_count: 78,
229 | metadata: { },
230 | output_type: "execute_result"
231 | }
232 | ],
233 | source: [
234 | "# skus_stats.head(2)"
235 | ]
236 | },
237 | {
238 | cell_type: "code",
239 | execution_count: 10,
240 | metadata: { },
241 | outputs: [ ],
242 | source: [
243 | "pq_b = sc.broadcast({row.sku: [row.p, row.q] for row in skus_stats.collect()})"
244 | ]
245 | },
246 | {
247 | cell_type: "code",
248 | execution_count: 11,
249 | metadata: { },
250 | outputs: [
251 | {
252 | data: {
253 | text/plain: [
254 | "[7.132311576894841, 5.0]"
255 | ]
256 | },
257 | execution_count: 11,
258 | metadata: { },
259 | output_type: "execute_result"
260 | }
261 | ],
262 | source: [
263 | "pq_b.value['FI911SHF89UBM-50']"
264 | ]
265 | },
266 | {
267 | cell_type: "code",
268 | execution_count: 157,
269 | metadata: {
270 | collapsed: true
271 | },
272 | outputs: [ ],
273 | source: [
274 | "#skus_idx_b = sc.broadcast({sku: idx for idx, sku in enumerate(pq_b.value.keys())})"
275 | ]
276 | },
277 | {
278 | cell_type: "code",
279 | execution_count: 158,
280 | metadata: { },
281 | outputs: [ ],
282 | source: [
283 | "#idx_skus_b = sc.broadcast({value: key for key, value in skus_idx_b.value.items()})"
284 | ]
285 | },
286 | {
287 | cell_type: "code",
288 | execution_count: 53,
289 | metadata: {
290 | collapsed: true
291 | },
292 | outputs: [ ],
293 | source: [
294 | "# d = {row.sku: row.idx for row in skus_rdd.collect()} ",
295 | "# db = sc.broadcast(d) ",
296 | " ",
297 | "# id_ = {value: key for key, value in d.items()} ",
298 | "# id_b = sc.broadcast(id_)"
299 | ]
300 | },
301 | {
302 | cell_type: "code",
303 | execution_count: 159,
304 | metadata: {
305 | collapsed: true
306 | },
307 | outputs: [ ],
308 | source: [
309 | "#numCols = sc.broadcast(len(idx_skus_b.value))"
310 | ]
311 | },
312 | {
313 | cell_type: "code",
314 | execution_count: 57,
315 | metadata: { },
316 | outputs: [ ],
317 | source: [
318 | "# p = [0] * numCols.value ",
319 | "# for row in skus_stats"
320 | ]
321 | },
322 | {
323 | cell_type: "code",
324 | execution_count: 55,
325 | metadata: {
326 | collapsed: true
327 | },
328 | outputs: [ ],
329 | source: [
330 | "#p = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()} # if 0 happens as the ``norm`` we expected an Exception to be raised. ",
331 | "#p_b = sc.broadcast(p)"
332 | ]
333 | },
334 | {
335 | cell_type: "code",
336 | execution_count: 34,
337 | metadata: { },
338 | outputs: [ ],
339 | source: [
340 | "#q = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()}"
341 | ]
342 | },
343 | {
344 | cell_type: "code",
345 | execution_count: 35,
346 | metadata: { },
347 | outputs: [
348 | {
349 | data: {
350 | text/plain: [
351 | "312988"
352 | ]
353 | },
354 | execution_count: 35,
355 | metadata: { },
356 | output_type: "execute_result"
357 | }
358 | ],
359 | source: [
360 | "#numCols.value"
361 | ]
362 | },
363 | {
364 | cell_type: "code",
365 | execution_count: 31,
366 | metadata: { },
367 | outputs: [
368 | {
369 | data: {
370 | text/plain: [
371 | "12.36931687685298"
372 | ]
373 | },
374 | execution_count: 31,
375 | metadata: { },
376 | output_type: "execute_result"
377 | }
378 | ],
379 | source: [
380 | "#skus_s['NI531SRM74IHX']"
381 | ]
382 | },
383 | {
384 | cell_type: "code",
385 | execution_count: 12,
386 | metadata: {
387 | collapsed: true
388 | },
389 | outputs: [ ],
390 | source: [
391 | "query_users_items = """ ",
392 | "SELECT ",
393 | "data ",
394 | "FROM( ",
395 | " SELECT ",
396 | " fv, ",
397 | " COLLECT_LIST(STRUCT(sku, score)) data ",
398 | " FROM test1 ",
399 | " GROUP BY 1 ",
400 | ") ",
401 | "WHERE SIZE(data) BETWEEN 2 AND 200 ",
402 | """" ",
403 | " ",
404 | "t0 = time.time() ",
405 | "users = spark.sql(query_users_items) ",
406 | "users_rdd = users.rdd"
407 | ]
408 | },
409 | {
410 | cell_type: "code",
411 | execution_count: 148,
412 | metadata: { },
413 | outputs: [
414 | {
415 | data: {
416 | text/plain: [
417 | "[Row(data=[Row(sku='CO796SCF87LXG', score=0.5), Row(sku='CO796SCM72JGT', score=0.5), Row(sku='CO796SCM23HHW', score=0.5)]), ",
418 | " Row(data=[Row(sku='HA723APF18CPL', score=0.5), Row(sku='CO515APF44YPR', score=0.5), Row(sku='LA906APF69OQC', score=0.5), Row(sku='TU142APF19BPC', score=0.5), Row(sku='CO515APF27DIA', score=0.5), Row(sku='GA753APF40NJR', score=0.5), Row(sku='GA753APF41NJQ', score=1.0)])]"
419 | ]
420 | },
421 | execution_count: 148,
422 | metadata: { },
423 | output_type: "execute_result"
424 | }
425 | ],
426 | source: [
427 | "users.head(2)"
428 | ]
429 | },
430 | {
431 | cell_type: "code",
432 | execution_count: 13,
433 | metadata: {
434 | collapsed: true
435 | },
436 | outputs: [ ],
437 | source: [
438 | "def map_cosines(row): ",
439 | " for i in range(len(row)): ",
440 | " value_i = row[i].score / pq_b.value[row[i].sku][1] ",
441 | " if random.random() < pq_b.value[row[i].sku][0]: ",
442 | " for j in range(i + 1, len(row)): ",
443 | " value_j = row[j].score / pq_b.value[row[j].sku][1] ",
444 | " if random.random() < pq_b.value[row[i].sku][0]: ",
445 | " yield ((row[i].sku, row[j].sku), value_i * value_j)"
446 | ]
447 | },
448 | {
449 | cell_type: "code",
450 | execution_count: 14,
451 | metadata: { },
452 | outputs: [ ],
453 | source: [
454 | "users2 = users.rdd.flatMap(lambda row: map_cosines(row.data))"
455 | ]
456 | },
457 | {
458 | cell_type: "code",
459 | execution_count: 150,
460 | metadata: { },
461 | outputs: [
462 | {
463 | data: {
464 | text/plain: [
465 | "[(('CO796SCM72JGT', 'CO796SCM23HHW'), 0.0002015811797719921), ",
466 | " (('HA723APF18CPL', 'CO515APF44YPR'), 0.031234752377721216)]"
467 | ]
468 | },
469 | execution_count: 150,
470 | metadata: { },
471 | output_type: "execute_result"
472 | }
473 | ],
474 | source: [
475 | "users2.take(2)"
476 | ]
477 | },
478 | {
479 | cell_type: "code",
480 | execution_count: 15,
481 | metadata: { },
482 | outputs: [ ],
483 | source: [
484 | "final = users2.reduceByKey(operator.add)"
485 | ]
486 | },
487 | {
488 | cell_type: "code",
489 | execution_count: 16,
490 | metadata: { },
491 | outputs: [
492 | {
493 | name: "stdout",
494 | output_type: "stream",
495 | text: [
496 | "[(('VI618SHF35NCY-51', 'LU773ACF56ILV'), 0.029501220638256383), (('FI911APF72ZHF', 'KA952APF52DNB'), 0.015504341823651058), (('FA865ACF45CCS', 'QU097ACF14BCMN'), 0.7071067811865475)] ",
497 | "363.733115196228 "
498 | ]
499 | }
500 | ],
501 | source: [
502 | "t0 = time.time() ",
503 | "print(final.take(3)) ",
504 | "print(time.time() - t0)"
505 | ]
506 | },
507 | {
508 | cell_type: "code",
509 | execution_count: null,
510 | metadata: {
511 | collapsed: true
512 | },
513 | outputs: [ ],
514 | source: [ ]
515 | },
516 | {
517 | cell_type: "code",
518 | execution_count: null,
519 | metadata: {
520 | collapsed: true
521 | },
522 | outputs: [ ],
523 | source: [ ]
524 | },
525 | {
526 | cell_type: "code",
527 | execution_count: null,
528 | metadata: {
529 | collapsed: true
530 | },
531 | outputs: [ ],
532 | source: [ ]
533 | },
534 | {
535 | cell_type: "code",
536 | execution_count: null,
537 | metadata: {
538 | collapsed: true
539 | },
540 | outputs: [ ],
541 | source: [ ]
542 | },
543 | {
544 | cell_type: "code",
545 | execution_count: null,
546 | metadata: {
547 | collapsed: true
548 | },
549 | outputs: [ ],
550 | source: [ ]
551 | },
552 | {
553 | cell_type: "code",
554 | execution_count: null,
555 | metadata: {
556 | collapsed: true
557 | },
558 | outputs: [ ],
559 | source: [ ]
560 | }
561 | ],
562 | metadata: {
563 | kernelspec: {
564 | display_name: "PySpark",
565 | language: "python",
566 | name: "pyspark"
567 | },
568 | language_info: {
569 | codemirror_mode: {
570 | name: "ipython",
571 | version: 3
572 | },
573 | file_extension: ".py",
574 | mimetype: "text/x-python",
575 | name: "python",
576 | nbconvert_exporter: "python",
577 | pygments_lexer: "ipython3",
578 | version: "3.5.2"
579 | }
580 | },
581 | nbformat: 4,
582 | nbformat_minor: 2
583 | }
584 |
--------------------------------------------------------------------------------
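A plain-Python sketch of the DIMSUM-style sampling performed by the notebook's `map_cosines`, with no Spark. Per the query_statistics cell, each sku carries a pair [p, q] where p is its sampling probability (gamma / ||c||) and q = min(gamma, ||c||); the values below are made up for illustration.

import random

# Illustrative per-sku [p, q] values; in the notebook these are broadcast as pq_b.
pq = {'A': [0.8, 1.5], 'B': [0.6, 2.0], 'C': [1.0, 1.0]}
basket = [('A', 0.5), ('B', 1.0), ('C', 0.5)]  # one user's (sku, score) pairs

def map_cosines(row):
    # A pair contributes value_i * value_j only when both coin flips pass,
    # each sku being sampled with its own probability p.
    for i in range(len(row)):
        sku_i, score_i = row[i]
        value_i = score_i / pq[sku_i][1]
        if random.random() < pq[sku_i][0]:
            for j in range(i + 1, len(row)):
                sku_j, score_j = row[j]
                value_j = score_j / pq[sku_j][1]
                if random.random() < pq[sku_j][0]:
                    yield (sku_i, sku_j), value_i * value_j

random.seed(0)
print(list(map_cosines(basket)))  # sampled partial cosine contributions

Summing these contributions per pair, as the reduceByKey(operator.add) step does, yields an estimate of the cosine similarity between the two sku columns.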
/notebooks/rdd_marreco_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | cells: [
3 | {
4 | cell_type: "code",
5 | execution_count: 40,
6 | metadata: {
7 | collapsed: true
8 | },
9 | outputs: [ ],
10 | source: [
11 | "import operator"
12 | ]
13 | },
14 | {
15 | cell_type: "code",
16 | execution_count: 50,
17 | metadata: {
18 | collapsed: true
19 | },
20 | outputs: [ ],
21 | source: [
22 | "import pyspark"
23 | ]
24 | },
25 | {
26 | cell_type: "code",
27 | execution_count: 51,
28 | metadata: { },
29 | outputs: [
30 | {
31 | data: {
32 | text/plain: [
33 | "'2.2.0'"
34 | ]
35 | },
36 | execution_count: 51,
37 | metadata: { },
38 | output_type: "execute_result"
39 | }
40 | ],
41 | source: [
42 | "pyspark.__version__"
43 | ]
44 | },
45 | {
46 | cell_type: "code",
47 | execution_count: 30,
48 | metadata: {
49 | collapsed: true
50 | },
51 | outputs: [ ],
52 | source: [
53 | "train_rdd = sc.textFile('gs://lbanor/pyspark/train_query*.gz')"
54 | ]
55 | },
56 | {
57 | cell_type: "code",
58 | execution_count: 31,
59 | metadata: {
60 | collapsed: true
61 | },
62 | outputs: [ ],
63 | source: [
64 | "header = train_rdd.first() ",
65 | "train_rdd = train_rdd.filter(lambda x: x != header).map(lambda x: x.split(','))"
66 | ]
67 | },
68 | {
69 | cell_type: "code",
70 | execution_count: 32,
71 | metadata: { },
72 | outputs: [
73 | {
74 | data: {
75 | text/plain: [
76 | "[['3383270414872112082', 'MO578SHF77RTI', '0.5'], ",
77 | " ['7143168022217708588', 'DA923SHF54UJP', '0.5'], ",
78 | " ['8844960186636261737', 'LU621ACM67NYU', '0.5']]"
79 | ]
80 | },
81 | execution_count: 32,
82 | metadata: { },
83 | output_type: "execute_result"
84 | }
85 | ],
86 | source: [
87 | "train_rdd.take(3)"
88 | ]
89 | },
90 | {
91 | cell_type: "code",
92 | execution_count: 33,
93 | metadata: {
94 | collapsed: true
95 | },
96 | outputs: [ ],
97 | source: [
98 | "train_rdd = train_rdd.map(lambda x: (x[0], (x[1], float(x[2])))).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > 1)"
99 | ]
100 | },
101 | {
102 | cell_type: "code",
103 | execution_count: 34,
104 | metadata: { },
105 | outputs: [
106 | {
107 | data: {
108 | text/plain: [
109 | "[('7357279563665682536', ",
110 | " [('CO515SHF91TPO', 0.5), ",
111 | " ('MA862SHF07OZG', 1.0), ",
112 | " ('DA923SHF00KPV', 0.5), ",
113 | " ('RA626SHF48VKP', 0.5), ",
114 | " ('UP428APF54RFP', 0.5), ",
115 | " ('OU295APF41KVE', 0.5)]), ",
116 | " ('3831524958866269889', ",
117 | " [('CA278SHF45EVY', 1.0), ",
118 | " ('SA232SHF43XLS', 0.5), ",
119 | " ('SA232SHF74ADT', 0.5), ",
120 | " ('LA628SHF52JSD', 0.5), ",
121 | " ('SA232SHF29PWG', 0.5), ",
122 | " ('DO302SHF23LDS', 0.5), ",
123 | " ('DO302SHF37LDE', 0.5), ",
124 | " ('CA278SHF48EVV', 0.5), ",
125 | " ('LA628SHF40IHZ', 0.5), ",
126 | " ('CA278SHF97UGW', 0.5), ",
127 | " ('CA278SHF45EVY', 0.5)]), ",
128 | " ('7808161502332133024', ",
129 | " [('AS296SCF58FCV', 0.5), ('CA278SHF78LIB', 0.5), ('CA278SHF77LIC', 0.5)])]"
130 | ]
131 | },
132 | execution_count: 34,
133 | metadata: { },
134 | output_type: "execute_result"
135 | }
136 | ],
137 | source: [
138 | "train_rdd.take(3)"
139 | ]
140 | },
141 | {
142 | cell_type: "code",
143 | execution_count: 35,
144 | metadata: {
145 | collapsed: true
146 | },
147 | outputs: [ ],
148 | source: [
149 | "train_rdd = train_rdd.map(lambda corr: [[((corr[1][i][0], corr[1][j][0]), corr[1][i][1] * corr[1][j][1]) for i in range(len(corr[1]))] for j in range(len(corr[1]))])"
150 | ]
151 | },
152 | {
153 | cell_type: "code",
154 | execution_count: 36,
155 | metadata: { },
156 | outputs: [
157 | {
158 | data: {
159 | text/plain: [
160 | "[[[(('CO515SHF91TPO', 'CO515SHF91TPO'), 0.25), ",
161 | " (('MA862SHF07OZG', 'CO515SHF91TPO'), 0.5), ",
162 | " (('DA923SHF00KPV', 'CO515SHF91TPO'), 0.25), ",
163 | " (('RA626SHF48VKP', 'CO515SHF91TPO'), 0.25), ",
164 | " (('UP428APF54RFP', 'CO515SHF91TPO'), 0.25), ",
165 | " (('OU295APF41KVE', 'CO515SHF91TPO'), 0.25)], ",
166 | " [(('CO515SHF91TPO', 'MA862SHF07OZG'), 0.5), ",
167 | " (('MA862SHF07OZG', 'MA862SHF07OZG'), 1.0), ",
168 | " (('DA923SHF00KPV', 'MA862SHF07OZG'), 0.5), ",
169 | " (('RA626SHF48VKP', 'MA862SHF07OZG'), 0.5), ",
170 | " (('UP428APF54RFP', 'MA862SHF07OZG'), 0.5), ",
171 | " (('OU295APF41KVE', 'MA862SHF07OZG'), 0.5)], ",
172 | " [(('CO515SHF91TPO', 'DA923SHF00KPV'), 0.25), ",
173 | " (('MA862SHF07OZG', 'DA923SHF00KPV'), 0.5), ",
174 | " (('DA923SHF00KPV', 'DA923SHF00KPV'), 0.25), ",
175 | " (('RA626SHF48VKP', 'DA923SHF00KPV'), 0.25), ",
176 | " (('UP428APF54RFP', 'DA923SHF00KPV'), 0.25), ",
177 | " (('OU295APF41KVE', 'DA923SHF00KPV'), 0.25)], ",
178 | " [(('CO515SHF91TPO', 'RA626SHF48VKP'), 0.25), ",
179 | " (('MA862SHF07OZG', 'RA626SHF48VKP'), 0.5), ",
180 | " (('DA923SHF00KPV', 'RA626SHF48VKP'), 0.25), ",
181 | " (('RA626SHF48VKP', 'RA626SHF48VKP'), 0.25), ",
182 | " (('UP428APF54RFP', 'RA626SHF48VKP'), 0.25), ",
183 | " (('OU295APF41KVE', 'RA626SHF48VKP'), 0.25)], ",
184 | " [(('CO515SHF91TPO', 'UP428APF54RFP'), 0.25), ",
185 | " (('MA862SHF07OZG', 'UP428APF54RFP'), 0.5), ",
186 | " (('DA923SHF00KPV', 'UP428APF54RFP'), 0.25), ",
187 | " (('RA626SHF48VKP', 'UP428APF54RFP'), 0.25), ",
188 | " (('UP428APF54RFP', 'UP428APF54RFP'), 0.25), ",
189 | " (('OU295APF41KVE', 'UP428APF54RFP'), 0.25)], ",
190 | " [(('CO515SHF91TPO', 'OU295APF41KVE'), 0.25), ",
191 | " (('MA862SHF07OZG', 'OU295APF41KVE'), 0.5), ",
192 | " (('DA923SHF00KPV', 'OU295APF41KVE'), 0.25), ",
193 | " (('RA626SHF48VKP', 'OU295APF41KVE'), 0.25), ",
194 | " (('UP428APF54RFP', 'OU295APF41KVE'), 0.25), ",
195 | " (('OU295APF41KVE', 'OU295APF41KVE'), 0.25)]], ",
196 | " [[(('CA278SHF45EVY', 'CA278SHF45EVY'), 1.0), ",
197 | " (('SA232SHF43XLS', 'CA278SHF45EVY'), 0.5), ",
198 | " (('SA232SHF74ADT', 'CA278SHF45EVY'), 0.5), ",
199 | " (('LA628SHF52JSD', 'CA278SHF45EVY'), 0.5), ",
200 | " (('SA232SHF29PWG', 'CA278SHF45EVY'), 0.5), ",
201 | " (('DO302SHF23LDS', 'CA278SHF45EVY'), 0.5), ",
202 | " (('DO302SHF37LDE', 'CA278SHF45EVY'), 0.5), ",
203 | " (('CA278SHF48EVV', 'CA278SHF45EVY'), 0.5), ",
204 | " (('LA628SHF40IHZ', 'CA278SHF45EVY'), 0.5), ",
205 | " (('CA278SHF97UGW', 'CA278SHF45EVY'), 0.5), ",
206 | " (('CA278SHF45EVY', 'CA278SHF45EVY'), 0.5)], ",
207 | " [(('CA278SHF45EVY', 'SA232SHF43XLS'), 0.5), ",
208 | " (('SA232SHF43XLS', 'SA232SHF43XLS'), 0.25), ",
209 | " (('SA232SHF74ADT', 'SA232SHF43XLS'), 0.25), ",
210 | " (('LA628SHF52JSD', 'SA232SHF43XLS'), 0.25), ",
211 | " (('SA232SHF29PWG', 'SA232SHF43XLS'), 0.25), ",
212 | " (('DO302SHF23LDS', 'SA232SHF43XLS'), 0.25), ",
213 | " (('DO302SHF37LDE', 'SA232SHF43XLS'), 0.25), ",
214 | " (('CA278SHF48EVV', 'SA232SHF43XLS'), 0.25), ",
215 | " (('LA628SHF40IHZ', 'SA232SHF43XLS'), 0.25), ",
216 | " (('CA278SHF97UGW', 'SA232SHF43XLS'), 0.25), ",
217 | " (('CA278SHF45EVY', 'SA232SHF43XLS'), 0.25)], ",
218 | " [(('CA278SHF45EVY', 'SA232SHF74ADT'), 0.5), ",
219 | " (('SA232SHF43XLS', 'SA232SHF74ADT'), 0.25), ",
220 | " (('SA232SHF74ADT', 'SA232SHF74ADT'), 0.25), ",
221 | " (('LA628SHF52JSD', 'SA232SHF74ADT'), 0.25), ",
222 | " (('SA232SHF29PWG', 'SA232SHF74ADT'), 0.25), ",
223 | " (('DO302SHF23LDS', 'SA232SHF74ADT'), 0.25), ",
224 | " (('DO302SHF37LDE', 'SA232SHF74ADT'), 0.25), ",
225 | " (('CA278SHF48EVV', 'SA232SHF74ADT'), 0.25), ",
226 | " (('LA628SHF40IHZ', 'SA232SHF74ADT'), 0.25), ",
227 | " (('CA278SHF97UGW', 'SA232SHF74ADT'), 0.25), ",
228 | " (('CA278SHF45EVY', 'SA232SHF74ADT'), 0.25)], ",
229 | " [(('CA278SHF45EVY', 'LA628SHF52JSD'), 0.5), ",
230 | " (('SA232SHF43XLS', 'LA628SHF52JSD'), 0.25), ",
231 | " (('SA232SHF74ADT', 'LA628SHF52JSD'), 0.25), ",
232 | " (('LA628SHF52JSD', 'LA628SHF52JSD'), 0.25), ",
233 | " (('SA232SHF29PWG', 'LA628SHF52JSD'), 0.25), ",
234 | " (('DO302SHF23LDS', 'LA628SHF52JSD'), 0.25), ",
235 | " (('DO302SHF37LDE', 'LA628SHF52JSD'), 0.25), ",
236 | " (('CA278SHF48EVV', 'LA628SHF52JSD'), 0.25), ",
237 | " (('LA628SHF40IHZ', 'LA628SHF52JSD'), 0.25), ",
238 | " (('CA278SHF97UGW', 'LA628SHF52JSD'), 0.25), ",
239 | " (('CA278SHF45EVY', 'LA628SHF52JSD'), 0.25)], ",
240 | " [(('CA278SHF45EVY', 'SA232SHF29PWG'), 0.5), ",
241 | " (('SA232SHF43XLS', 'SA232SHF29PWG'), 0.25), ",
242 | " (('SA232SHF74ADT', 'SA232SHF29PWG'), 0.25), ",
243 | " (('LA628SHF52JSD', 'SA232SHF29PWG'), 0.25), ",
244 | " (('SA232SHF29PWG', 'SA232SHF29PWG'), 0.25), ",
245 | " (('DO302SHF23LDS', 'SA232SHF29PWG'), 0.25), ",
246 | " (('DO302SHF37LDE', 'SA232SHF29PWG'), 0.25), ",
247 | " (('CA278SHF48EVV', 'SA232SHF29PWG'), 0.25), ",
248 | " (('LA628SHF40IHZ', 'SA232SHF29PWG'), 0.25), ",
249 | " (('CA278SHF97UGW', 'SA232SHF29PWG'), 0.25), ",
250 | " (('CA278SHF45EVY', 'SA232SHF29PWG'), 0.25)], ",
251 | " [(('CA278SHF45EVY', 'DO302SHF23LDS'), 0.5), ",
252 | " (('SA232SHF43XLS', 'DO302SHF23LDS'), 0.25), ",
253 | " (('SA232SHF74ADT', 'DO302SHF23LDS'), 0.25), ",
254 | " (('LA628SHF52JSD', 'DO302SHF23LDS'), 0.25), ",
255 | " (('SA232SHF29PWG', 'DO302SHF23LDS'), 0.25), ",
256 | " (('DO302SHF23LDS', 'DO302SHF23LDS'), 0.25), ",
257 | " (('DO302SHF37LDE', 'DO302SHF23LDS'), 0.25), ",
258 | " (('CA278SHF48EVV', 'DO302SHF23LDS'), 0.25), ",
259 | " (('LA628SHF40IHZ', 'DO302SHF23LDS'), 0.25), ",
260 | " (('CA278SHF97UGW', 'DO302SHF23LDS'), 0.25), ",
261 | " (('CA278SHF45EVY', 'DO302SHF23LDS'), 0.25)], ",
262 | " [(('CA278SHF45EVY', 'DO302SHF37LDE'), 0.5), ",
263 | " (('SA232SHF43XLS', 'DO302SHF37LDE'), 0.25), ",
264 | " (('SA232SHF74ADT', 'DO302SHF37LDE'), 0.25), ",
265 | " (('LA628SHF52JSD', 'DO302SHF37LDE'), 0.25), ",
266 | " (('SA232SHF29PWG', 'DO302SHF37LDE'), 0.25), ",
267 | " (('DO302SHF23LDS', 'DO302SHF37LDE'), 0.25), ",
268 | " (('DO302SHF37LDE', 'DO302SHF37LDE'), 0.25), ",
269 | " (('CA278SHF48EVV', 'DO302SHF37LDE'), 0.25), ",
270 | " (('LA628SHF40IHZ', 'DO302SHF37LDE'), 0.25), ",
271 | " (('CA278SHF97UGW', 'DO302SHF37LDE'), 0.25), ",
272 | " (('CA278SHF45EVY', 'DO302SHF37LDE'), 0.25)], ",
273 | " [(('CA278SHF45EVY', 'CA278SHF48EVV'), 0.5), ",
274 | " (('SA232SHF43XLS', 'CA278SHF48EVV'), 0.25), ",
275 | " (('SA232SHF74ADT', 'CA278SHF48EVV'), 0.25), ",
276 | " (('LA628SHF52JSD', 'CA278SHF48EVV'), 0.25), ",
277 | " (('SA232SHF29PWG', 'CA278SHF48EVV'), 0.25), ",
278 | " (('DO302SHF23LDS', 'CA278SHF48EVV'), 0.25), ",
279 | " (('DO302SHF37LDE', 'CA278SHF48EVV'), 0.25), ",
280 | " (('CA278SHF48EVV', 'CA278SHF48EVV'), 0.25), ",
281 | " (('LA628SHF40IHZ', 'CA278SHF48EVV'), 0.25), ",
282 | " (('CA278SHF97UGW', 'CA278SHF48EVV'), 0.25), ",
283 | " (('CA278SHF45EVY', 'CA278SHF48EVV'), 0.25)], ",
284 | " [(('CA278SHF45EVY', 'LA628SHF40IHZ'), 0.5), ",
285 | " (('SA232SHF43XLS', 'LA628SHF40IHZ'), 0.25), ",
286 | " (('SA232SHF74ADT', 'LA628SHF40IHZ'), 0.25), ",
287 | " (('LA628SHF52JSD', 'LA628SHF40IHZ'), 0.25), ",
288 | " (('SA232SHF29PWG', 'LA628SHF40IHZ'), 0.25), ",
289 | " (('DO302SHF23LDS', 'LA628SHF40IHZ'), 0.25), ",
290 | " (('DO302SHF37LDE', 'LA628SHF40IHZ'), 0.25), ",
291 | " (('CA278SHF48EVV', 'LA628SHF40IHZ'), 0.25), ",
292 | " (('LA628SHF40IHZ', 'LA628SHF40IHZ'), 0.25), ",
293 | " (('CA278SHF97UGW', 'LA628SHF40IHZ'), 0.25), ",
294 | " (('CA278SHF45EVY', 'LA628SHF40IHZ'), 0.25)], ",
295 | " [(('CA278SHF45EVY', 'CA278SHF97UGW'), 0.5), ",
296 | " (('SA232SHF43XLS', 'CA278SHF97UGW'), 0.25), ",
297 | " (('SA232SHF74ADT', 'CA278SHF97UGW'), 0.25), ",
298 | " (('LA628SHF52JSD', 'CA278SHF97UGW'), 0.25), ",
299 | " (('SA232SHF29PWG', 'CA278SHF97UGW'), 0.25), ",
300 | " (('DO302SHF23LDS', 'CA278SHF97UGW'), 0.25), ",
301 | " (('DO302SHF37LDE', 'CA278SHF97UGW'), 0.25), ",
302 | " (('CA278SHF48EVV', 'CA278SHF97UGW'), 0.25), ",
303 | " (('LA628SHF40IHZ', 'CA278SHF97UGW'), 0.25), ",
304 | " (('CA278SHF97UGW', 'CA278SHF97UGW'), 0.25), ",
305 | " (('CA278SHF45EVY', 'CA278SHF97UGW'), 0.25)], ",
306 | " [(('CA278SHF45EVY', 'CA278SHF45EVY'), 0.5), ",
307 | " (('SA232SHF43XLS', 'CA278SHF45EVY'), 0.25), ",
308 | " (('SA232SHF74ADT', 'CA278SHF45EVY'), 0.25), ",
309 | " (('LA628SHF52JSD', 'CA278SHF45EVY'), 0.25), ",
310 | " (('SA232SHF29PWG', 'CA278SHF45EVY'), 0.25), ",
311 | " (('DO302SHF23LDS', 'CA278SHF45EVY'), 0.25), ",
312 | " (('DO302SHF37LDE', 'CA278SHF45EVY'), 0.25), ",
313 | " (('CA278SHF48EVV', 'CA278SHF45EVY'), 0.25), ",
314 | " (('LA628SHF40IHZ', 'CA278SHF45EVY'), 0.25), ",
315 | " (('CA278SHF97UGW', 'CA278SHF45EVY'), 0.25), ",
316 | " (('CA278SHF45EVY', 'CA278SHF45EVY'), 0.25)]], ",
317 | " [[(('AS296SCF58FCV', 'AS296SCF58FCV'), 0.25), ",
318 | " (('CA278SHF78LIB', 'AS296SCF58FCV'), 0.25), ",
319 | " (('CA278SHF77LIC', 'AS296SCF58FCV'), 0.25)], ",
320 | " [(('AS296SCF58FCV', 'CA278SHF78LIB'), 0.25), ",
321 | " (('CA278SHF78LIB', 'CA278SHF78LIB'), 0.25), ",
322 | " (('CA278SHF77LIC', 'CA278SHF78LIB'), 0.25)], ",
323 | " [(('AS296SCF58FCV', 'CA278SHF77LIC'), 0.25), ",
324 | " (('CA278SHF78LIB', 'CA278SHF77LIC'), 0.25), ",
325 | " (('CA278SHF77LIC', 'CA278SHF77LIC'), 0.25)]]]"
326 | ]
327 | },
328 | execution_count: 36,
329 | metadata: { },
330 | output_type: "execute_result"
331 | }
332 | ],
333 | source: [
334 | "train_rdd.take(3):"
335 | ]
336 | },
337 | {
338 | cell_type: "code",
339 | execution_count: 37,
340 | metadata: {
341 | collapsed: true
342 | },
343 | outputs: [ ],
344 | source: [
345 | "train_rdd = train_rdd.flatMap(lambda x: x).flatMap(lambda x: x)"
346 | ]
347 | },
348 | {
349 | cell_type: "code",
350 | execution_count: 38,
351 | metadata: { },
352 | outputs: [
353 | {
354 | data: {
355 | text/plain: [
356 | "[(('LU759APM92BCD', 'LU759APM92BCD'), 0.25), ",
357 | " (('MA099APM20LVF', 'LU759APM92BCD'), 0.25), ",
358 | " (('DU387APM43GAQ', 'LU759APM92BCD'), 0.25)]"
359 | ]
360 | },
361 | execution_count: 38,
362 | metadata: { },
363 | output_type: "execute_result"
364 | }
365 | ],
366 | source: [
367 | "train_rdd.take(3)"
368 | ]
369 | },
370 | {
371 | cell_type: "code",
372 | execution_count: 43,
373 | metadata: { },
374 | outputs: [
375 | {
376 | data: {
377 | text/plain: [
378 | "110201577"
379 | ]
380 | },
381 | execution_count: 43,
382 | metadata: { },
383 | output_type: "execute_result"
384 | }
385 | ],
386 | source: [
387 | "train_rdd.count()"
388 | ]
389 | },
390 | {
391 | cell_type: "code",
392 | execution_count: 41,
393 | metadata: {
394 | collapsed: true
395 | },
396 | outputs: [ ],
397 | source: [
398 | "r = train_rdd.reduceByKey(operator.add)"
399 | ]
400 | },
401 | {
402 | cell_type: "code",
403 | execution_count: 42,
404 | metadata: { },
405 | outputs: [
406 | {
407 | data: {
408 | text/plain: [
409 | "[(('VI185ACF98VRP', 'SA232ACF26KXP'), 0.75), ",
410 | " (('HA651APF16CBF', 'CA558APF27RGU'), 0.25), ",
411 | " (('AG170APF90ZUN', 'MA250APF39AWQ'), 0.5)]"
412 | ]
413 | },
414 | execution_count: 42,
415 | metadata: { },
416 | output_type: "execute_result"
417 | }
418 | ],
419 | source: [
420 | "r.take(3) # taking almost 30 mins now..."
421 | ]
422 | },
423 | {
424 | cell_type: "code",
425 | execution_count: 61,
426 | metadata: {
427 | collapsed: true
428 | },
429 | outputs: [ ],
430 | source: [
431 | "r = sc.parallelize([[(i, i) for i in range(300000) ] ]).flatMap(lambda x: x) ",
432 | "r2 = sc.parallelize([[(i, i) for i in range(300000) ] ]).flatMap(lambda x: x)"
433 | ]
434 | },
435 | {
436 | cell_type: "code",
437 | execution_count: 65,
438 | metadata: {
439 | collapsed: true
440 | },
441 | outputs: [ ],
442 | source: [
443 | "r3 = r.cartesian(r2)"
444 | ]
445 | }
446 | ],
447 | metadata: {
448 | kernelspec: {
449 | display_name: "PySpark",
450 | language: "python",
451 | name: "pyspark"
452 | },
453 | language_info: {
454 | codemirror_mode: {
455 | name: "ipython",
456 | version: 3
457 | },
458 | file_extension: ".py",
459 | mimetype: "text/x-python",
460 | name: "python",
461 | nbconvert_exporter: "python",
462 | pygments_lexer: "ipython3",
463 | version: "3.5.2"
464 | }
465 | },
466 | nbformat: 4,
467 | nbformat_minor: 2
468 | }
469 |
--------------------------------------------------------------------------------
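For contrast with the DIMSUM notebook, here is a tiny plain-Python version of the brute-force pairwise product that rdd_marreco_test.ipynb builds with the nested comprehensions above: every ordered (sku_i, sku_j) score product within a user's basket, summed across users. The toy baskets are invented; on the real data this quadratic blow-up is what makes the reduce step take roughly 30 minutes, as noted in the notebook.

from collections import defaultdict

# Toy (user -> [(sku, score), ...]) baskets, for illustration only.
users = {
    'u1': [('SKU_A', 0.5), ('SKU_B', 1.0)],
    'u2': [('SKU_A', 0.5), ('SKU_B', 0.5), ('SKU_C', 1.0)],
}

pairs = defaultdict(float)
for basket in users.values():
    for sku_i, score_i in basket:       # all ordered pairs, diagonal included,
        for sku_j, score_j in basket:   # exactly like the nested comprehensions
            pairs[(sku_i, sku_j)] += score_i * score_j  # reduceByKey(operator.add)

print(dict(pairs))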
/tests/system/spark_jobs/test_neighbor.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | """
24 | These are system tests and should only be run if the environment has pyspark
25 | and a spark cluster available to receive on-demand jobs.
26 | """
27 |
28 | import os
29 | import unittest
30 | import sys
31 | import mock
32 | import json
33 | import datetime
34 | import pyspark
35 | import math
36 | import glob
37 | import shutil
38 | from collections import namedtuple
39 | import numpy as np
40 |
41 | from pyspark.sql import types as stypes
42 | sys.path.append('./spark_jobs')
43 |
44 | py_files = ['./spark_jobs/neighbor.py',
45 | './spark_jobs/base.py',
46 | './spark_jobs/factory.py']
47 |
48 |
49 | class Test_neighbor(unittest.TestCase):
50 |
51 | _sc = pyspark.SparkContext(pyFiles=py_files)
52 | _session = pyspark.sql.SparkSession(_sc)
53 | _to_delete_uris = []
54 |
55 |
56 | @staticmethod
57 | def _get_target_class():
58 | from neighbor import MarrecoNeighborJob
59 |
60 |
61 | return MarrecoNeighborJob
62 |
63 |
64 | @staticmethod
65 | def _delete_dirs(*args):
66 | for arg in args:
67 | if os.path.isdir(arg):
68 | shutil.rmtree(arg)
69 |
70 |
71 | def _prepare_daily_data(self):
72 | for i in [1, 2]:
73 | uri = 'tests/system/data/neighbor/train/{}/train.json'.format(
74 | i)
75 | data = self._sc.textFile(uri)
76 | formatted_day = (datetime.datetime.now() -
77 | datetime.timedelta(days=i)).strftime('%Y-%m-%d')
78 |
79 | save_uri = 'tests/system/data/neighbor/train/{}/train.json'.format(
80 | formatted_day)
81 | self._delete_dirs(save_uri)
82 | self._to_delete_uris.append(os.path.dirname(save_uri))
83 | data.saveAsTextFile(save_uri)
84 |
85 |
86 | def _delete_uris(self):
87 | for uri in self._to_delete_uris:
88 | self._delete_dirs(uri)
89 | self._to_delete_uris = []
90 |
91 |
92 | def test_process_datajet_day_no_force(self):
93 | klass = self._get_target_class()()
94 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay'])
95 | args = Args(0.5, 2.0, 0.1)
96 | inter_uri = 'tests/system/data/dj'
97 | self._delete_dirs(inter_uri)
98 | self.assertFalse(os.path.isdir(inter_uri))
99 |
100 | test_days = (datetime.datetime.now()
101 | - datetime.datetime(*[2017, 8, 13])).days
102 |
103 | klass._process_datajet_day(self._sc,
104 | 'tests/system/data/datajet_test.json',
105 | inter_uri,
106 | args,
107 | mode=None,
108 | compression=None)
109 |
110 | expected = [str({"user_id": "25e35a54c8cace51",
111 | "interacted_items":[{"key":"MA042APM76IPJ",
112 | "score": float(str(args.w_browse * math.exp(
113 | -args.decay * test_days))[:9])}]}),
114 | str({"user_id": "610574c802ba3b33",
115 | "interacted_items":[{"key":"DA923SHF35RHK",
116 | "score": float(str(args.w_purchase * math.exp(
117 | -args.decay * test_days))[:9])},
118 | {"key": "VI618SHF69UQC",
119 | "score": float(str(args.w_purchase * math.exp(
120 | -args.decay * test_days))[:9])}]})]
121 |
122 | result = [json.loads(i) for i in ''.join([open(e).read()
123 | for e in glob.glob(inter_uri + '/*.json')]).split('\n') if i]
124 | for e in result:
125 | for item in e['interacted_items']:
126 | item['score'] = float(str(item['score'])[:9])
127 |
128 | self.assertEqual(expected, [str(e) for e in result])
129 |
130 |
131 | def test_process_datajet_day_yes_force(self):
132 | klass = self._get_target_class()()
133 | spark = pyspark.sql.SparkSession(self._sc)
134 | Args = namedtuple('args', ['w_browse', 'w_purchase', 'decay'])
135 | args = Args(0.5, 2.0, 0.1)
136 |         inter_uri = 'tests/system/data/dj'
137 | self._delete_dirs(inter_uri)
138 |
139 | test_days = (datetime.datetime.now()
140 | - datetime.datetime(*[2017, 8, 13])).days
141 |
142 | klass._process_datajet_day(self._sc,
143 | 'tests/system/data/datajet_test.json',
144 | inter_uri,
145 | args,
146 | mode=None,
147 | compression=None)
148 |
149 | self.assertTrue(os.path.isdir(inter_uri))
150 |
151 | klass._process_datajet_day(self._sc,
152 | 'tests/system/data/datajet_test.json',
153 | inter_uri,
154 | args,
155 | mode='overwrite',
156 | compression=None)
157 |
158 | expected = [str({"user_id": "25e35a54c8cace51",
159 | "interacted_items":[{"key":"MA042APM76IPJ",
160 | "score": float(str(args.w_browse * math.exp(
161 | -args.decay * test_days))[:9])}]}),
162 | str({"user_id": "610574c802ba3b33",
163 | "interacted_items":[{"key":"DA923SHF35RHK",
164 | "score": float(str(args.w_purchase * math.exp(
165 | -args.decay * test_days))[:9])},
166 | {"key": "VI618SHF69UQC",
167 | "score": float(str(args.w_purchase * math.exp(
168 | -args.decay * test_days))[:9])}]})]
169 |
170 | result = [json.loads(i) for i in ''.join([open(e).read()
171 | for e in glob.glob(inter_uri + '/*.json')]).split('\n') if i]
172 | for e in result:
173 | for item in e['interacted_items']:
174 | item['score'] = float(str(item['score'])[:9])
175 |
176 | self.assertEqual(expected, [str(e) for e in result])
177 | self._delete_dirs(inter_uri)
178 | self.assertFalse(os.path.isdir(inter_uri))
179 |
180 |
181 | def test_transform_data_no_force(self):
182 | klass = self._get_target_class()()
183 | inter_uri = 'tests/system/data/neighbor/inter/{}'
184 | Args = namedtuple('args', ['days_init',
185 | 'days_end',
186 | 'w_browse',
187 | 'w_purchase',
188 | 'force',
189 | 'source_uri',
190 | 'inter_uri',
191 | 'neighbor_uri',
192 | 'threshold',
193 | 'decay'])
194 |
195 | self._prepare_daily_data()
196 |
197 | args = Args(2, 1, 0.5, 6, 'no',
198 | 'tests/system/data/neighbor/train/{}/train.json',
199 | inter_uri,
200 |             'tests/system/data/neighbor/result',
201 | 0.0, 0.0)
202 | klass.transform_data(self._sc, args)
203 |
204 | data1_uri = 'tests/system/data/neighbor/transformed_1.json'
205 | data2_uri = 'tests/system/data/neighbor/transformed_2.json'
206 |
207 | expected = {2: sorted(self._session.read.json(data2_uri,
208 | schema = klass._load_users_matrix_schema()) \
209 | .toJSON().collect()),
210 | 1: sorted(self._session.read.json(data1_uri,
211 | schema = klass._load_users_matrix_schema()) \
212 | .toJSON().collect())}
213 |
214 | for day in range(args.days_init, args.days_end - 1, -1):
215 | formatted_day = klass.get_formatted_date(day)
216 | result = sorted(self._session.read.json(
217 | inter_uri.format(formatted_day),
218 | schema=klass._load_users_matrix_schema())\
219 | .toJSON().collect())
220 | for i in range(len(result)):
221 | result_i = json.loads(result[i])
222 | result_i['interacted_items'] = sorted(
223 | result_i['interacted_items'], key=lambda x: x['key'])
224 | expected_i = json.loads(expected[day][i])
225 | expected_i['interacted_items'] = sorted(
226 | expected_i['interacted_items'], key=lambda x: x['key'])
227 | self.assertEqual(expected_i, result_i)
228 |
229 | for day in [2, 1]:
230 | formatted_day = klass.get_formatted_date(day)
231 | self._delete_dirs(inter_uri.format(formatted_day))
232 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
233 | self._delete_uris()
234 | self.assertEqual(self._to_delete_uris, [])
235 |
236 |
237 | def test_transform_data_yes_force(self):
238 | klass = self._get_target_class()()
239 | inter_uri = 'tests/system/data/neighbor/inter/{}'
240 | Args = namedtuple('args', ['days_init',
241 | 'days_end',
242 | 'w_browse',
243 | 'w_purchase',
244 | 'force',
245 | 'source_uri',
246 | 'inter_uri',
247 | 'neighbor_uri',
248 | 'threshold',
249 | 'decay'])
250 |
251 | self._prepare_daily_data()
252 |
253 | args = Args(2, 1, 0.5, 6, 'no',
254 | 'tests/system/data/neighbor/train/{}/train.json',
255 | inter_uri,
256 |             'tests/system/data/neighbor/result',
257 | 0.0, 0.0)
258 | klass.transform_data(self._sc, args)
259 |
260 | args = Args(2, 1, 0.5, 6, 'yes',
261 | 'tests/system/data/neighbor/train/{}/train.json',
262 | inter_uri,
263 |             'tests/system/data/neighbor/result',
264 | 0.0, 0.0)
265 | klass.transform_data(self._sc, args)
266 |
267 | data1_uri = 'tests/system/data/neighbor/transformed_1.json'
268 | data2_uri = 'tests/system/data/neighbor/transformed_2.json'
269 |
270 | expected = {2: sorted(self._session.read.json(data2_uri,
271 | schema = klass._load_users_matrix_schema()) \
272 | .toJSON().collect()),
273 | 1: sorted(self._session.read.json(data1_uri,
274 | schema = klass._load_users_matrix_schema()) \
275 | .toJSON().collect())}
276 |
277 | for day in range(args.days_init, args.days_end - 1, -1):
278 | formatted_day = klass.get_formatted_date(day)
279 | result = sorted(self._session.read.json(
280 | inter_uri.format(formatted_day),
281 | schema=klass._load_users_matrix_schema())\
282 | .toJSON().collect())
283 | for i in range(len(result)):
284 | result_i = json.loads(result[i])
285 | result_i['interacted_items'] = sorted(
286 | result_i['interacted_items'], key=lambda x: x['key'])
287 | expected_i = json.loads(expected[day][i])
288 | expected_i['interacted_items'] = sorted(
289 | expected_i['interacted_items'], key=lambda x: x['key'])
290 | self.assertEqual(expected_i, result_i)
291 |
292 | for day in [2, 1]:
293 | formatted_day = klass.get_formatted_date(day)
294 | self._delete_dirs(inter_uri.format(formatted_day))
295 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
296 | self._delete_uris()
297 | self.assertEqual(self._to_delete_uris, [])
298 |
299 |
300 | def test_build_marreco(self):
301 | klass = self._get_target_class()()
302 | result_uri = 'tests/system/data/neighbor/result/similarity'
303 | inter_uri = 'tests/system/data/neighbor/inter/{}'
304 | users_matrix_uri = 'tests/system/data/neighbor/result/users'
305 |
306 | self._delete_dirs(result_uri, users_matrix_uri)
307 | self.assertFalse(os.path.isdir(result_uri))
308 | self.assertFalse(os.path.isdir(users_matrix_uri))
309 |
310 | self._prepare_daily_data()
311 |
312 | Args = namedtuple('args', ['days_init',
313 | 'days_end',
314 | 'w_browse',
315 | 'w_purchase',
316 | 'force',
317 | 'source_uri',
318 | 'inter_uri',
319 | 'neighbor_uri',
320 | 'threshold',
321 | 'decay'])
322 |
323 | args = Args(2, 1, 0.5, 6, 'no',
324 | 'tests/system/data/neighbor/train/{}/train.json',
325 | inter_uri,
326 |                     'tests/system/data/neighbor/result',
327 | 0.0, 0.0)
328 |
329 | klass.transform_data(self._sc, args)
330 |
331 | Args = namedtuple('args', ['days_init',
332 | 'days_end',
333 | 'inter_uri',
334 | 'neighbor_uri',
335 | 'threshold',
336 | 'users_matrix_uri'])
337 |
338 | args = Args(2, 1, inter_uri, result_uri, 0.0, users_matrix_uri)
339 |
340 | klass.build_marreco(self._sc, args)
341 | result = self._session.read.json(result_uri).collect()
342 |
343 | a = np.array([[0.5, 1., 0.5, 2.],
344 | [1., 2., 1., 0.5],
345 | [6., 1., 0.5, 0.5],
346 | [1., 1., 6., 6.]])
347 | n = np.linalg.norm(a, axis=0).reshape(1, a.shape[1])
348 |
349 | expected = a.T.dot(a) / n.T.dot(n)
350 |
351 | for row in result:
352 | key1 = row.item_key
353 | for inner_row in row.similarity_items:
354 | np.testing.assert_almost_equal(
355 | expected[int(key1), int(inner_row.key)],
356 | inner_row.score, decimal=6)
357 |
358 | for day in [2, 1]:
359 | formatted_day = klass.get_formatted_date(day)
360 | self._delete_dirs(inter_uri.format(formatted_day))
361 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
362 | self._delete_dirs(result_uri)
363 | self.assertFalse(os.path.isdir(result_uri))
364 |
365 |
366 | def test_build_marreco_with_threshold(self):
367 | klass = self._get_target_class()()
368 | result_uri = 'tests/system/data/neighbor/result/similarity'
369 | inter_uri = 'tests/system/data/neighbor/inter/{}'
370 | users_matrix_uri = 'tests/system/data/neighbor/result/users'
371 |
372 | self._prepare_daily_data()
373 |
374 | self._delete_dirs(result_uri, users_matrix_uri)
375 | self.assertFalse(os.path.isdir(result_uri))
376 | self.assertFalse(os.path.isdir(users_matrix_uri))
377 |
378 | Args = namedtuple('args', ['days_init',
379 | 'days_end',
380 | 'w_browse',
381 | 'w_purchase',
382 | 'force',
383 | 'source_uri',
384 | 'inter_uri',
385 | 'neighbor_uri',
386 | 'threshold',
387 | 'decay'])
388 |
389 | args = Args(2, 1, 0.5, 6, 'no',
390 | 'tests/system/data/neighbor/train/{}/train.json',
391 | inter_uri,
392 |                     'tests/system/data/neighbor/result',
393 | 0.0, 0.0)
394 |
395 | klass.transform_data(self._sc, args)
396 |
397 | Args = namedtuple('args', ['days_init',
398 | 'days_end',
399 | 'inter_uri',
400 | 'neighbor_uri',
401 | 'threshold',
402 | 'users_matrix_uri'])
403 |
404 | args = Args(2, 1, inter_uri, result_uri, 0.11, users_matrix_uri)
405 |
406 | klass.build_marreco(self._sc, args)
407 | result = self._session.read.json(result_uri).collect()
408 |
409 | a = np.array([[0.5, 1., 0.5, 2.],
410 | [1., 2., 1., 0.5],
411 | [6., 1., 0.5, 0.5],
412 | [1., 1., 6., 6.]])
413 |
414 | n = np.linalg.norm(a, axis=0).reshape(1, a.shape[1])
415 |
416 | expected = a.T.dot(a) / n.T.dot(n)
417 |
418 | Args = namedtuple('args', ['days_init',
419 | 'days_end',
420 | 'inter_uri',
421 | 'neighbor_uri',
422 | 'threshold',
423 | 'users_matrix_uri'])
424 |
425 | args = Args(2, 1, inter_uri, result_uri, 0.11, users_matrix_uri)
426 |
427 | klass.build_marreco(self._sc, args)
428 | result = self._session.read.json(result_uri).collect()
429 |
430 | for row in result:
431 | key1 = row.item_key
432 | for inner_row in row.similarity_items:
433 |                 expected_value = expected[int(key1), int(inner_row.key)]
434 |                 print('expected: ', expected_value)
435 |                 print('estimate: ', inner_row.score)
436 |                 self.assertTrue(abs(expected_value -
437 |                     inner_row.score) / expected_value < 0.2)
438 |
439 |
440 | for day in [2, 1]:
441 | formatted_day = klass.get_formatted_date(day)
442 | self._delete_dirs(inter_uri.format(formatted_day))
443 | self.assertFalse(os.path.isdir(inter_uri.format(formatted_day)))
444 |
445 |
446 | self._delete_dirs(result_uri, users_matrix_uri)
447 | self.assertFalse(os.path.isdir(result_uri))
448 | self.assertFalse(os.path.isdir(users_matrix_uri))
449 |
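# Editor's note -- an illustrative sketch, not part of the original test suite.
# The ``expected`` matrices in the ``test_build_marreco`` tests above are the
# column-wise cosine similarities of the toy user x item matrix ``a``: entry
# (i, j) of ``a.T.dot(a)`` is the dot product of columns i and j, while entry
# (i, j) of ``n.T.dot(n)`` is the product of their norms, so the ratio is
# cos(col_i, col_j). Worked by hand for columns 0 and 1 of ``a``:
#
#   dot    = 0.5 * 1 + 1 * 2 + 6 * 1 + 1 * 1                # = 9.5
#   norms  = sqrt(0.25 + 1 + 36 + 1) * sqrt(1 + 4 + 1 + 1)  # ~= 16.36
#   cosine = dot / norms                                     # ~= 0.58
#
# which is the value DIMSUM should reproduce (or approximate, when a non-zero
# threshold is used) for that pair of item keys.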
--------------------------------------------------------------------------------
/notebooks/marreco_dimsum_sparse.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyspark.mllib.linalg import SparseVector\n",
12 | "from pyspark.mllib.linalg.distributed import RowMatrix\n",
13 | "import numpy as np\n",
14 | "from sklearn.metrics.pairwise import cosine_similarity\n",
15 | "import time\n",
16 | "from collections import defaultdict\n",
17 | "from pyspark.sql import functions as sfunc\n",
18 | "from pyspark.sql import types as stypes\n",
19 | "import math\n",
20 | "import sys\n",
21 | "from pyspark.ml.linalg import SparseVector\n",
22 | "from pyspark.mllib.linalg.distributed import RowMatrix\n",
23 | "from operator import itemgetter\n",
24 | "import operator\n",
25 | "import random"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "schema = stypes.StructType().add(\"fv\", stypes.StringType()).add(\"sku\", stypes.StringType()).add(\"score\", stypes.FloatType())\n",
37 | "train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True, schema=schema)\n",
38 | "train_df.createOrReplaceTempView('test1')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 17,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "[Row(fv='6094795238635852694', sku='BR049APM25PCS', score=0.5),\n",
50 | " Row(fv='7454424246364596889', sku='TR763APF11DLC', score=0.5),\n",
51 | " Row(fv='5798933384203870548', sku='AN778SHF35NNG', score=0.5)]"
52 | ]
53 | },
54 | "execution_count": 17,
55 | "metadata": {},
56 | "output_type": "execute_result"
57 | }
58 | ],
59 | "source": [
60 | "train_df.head(3)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 188,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "[Row(fv='1005105267406228429', sku='FI911SHF89UBM-50', score=5.0)]\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "print(train_df.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 82,
83 | "metadata": {
84 | "collapsed": true
85 | },
86 | "outputs": [],
87 | "source": [
88 | "# query = \"\"\"\n",
89 | "# SELECT\n",
90 | "# sku,\n",
91 | "# ROW_NUMBER() OVER (ORDER BY SUM(1)) -1 idx\n",
92 | "# FROM test1\n",
93 | "# GROUP BY 1\n",
94 | "# \"\"\"\n",
95 | "# skus_rdd = spark.sql(query).rdd"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 4,
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "query_statistics = \"\"\"\n",
107 | "SELECT\n",
108 | " sku,\n",
109 | " SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) / SQRT(SUM(score * score)) p,\n",
110 | " IF(SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold}) > SQRT(SUM(score * score)), SQRT(SUM(score * score)), SQRT(10 * LOG(COUNT(sku) OVER()) / {threshold})) q --- implements the min(gamma, ||c||)\n",
111 | "FROM test1\n",
112 | "GROUP BY 1\n",
113 | "\"\"\""
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 8,
119 | "metadata": {
120 | "collapsed": true
121 | },
122 | "outputs": [],
123 | "source": [
124 | "skus_stats = spark.sql(query_statistics.format(threshold=0.1))"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 9,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "[Row(sku='FI911SHF89UBM-50', p=7.132311576894841, q=5.0)]\n"
137 | ]
138 | }
139 | ],
140 | "source": [
141 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 178,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "data": {
151 | "text/plain": [
152 | "[Row(sku='PO140ACU06DDD', p=2.4697175158107982, q=14.439529078193651),\n",
153 | " Row(sku='PO140ACU76FVN', p=35.661557884474206, q=1.0),\n",
154 | " Row(sku='JU082SHF02WWZ', p=3.790780833876121, q=9.40744386111339)]"
155 | ]
156 | },
157 | "execution_count": 178,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 |     "skus_stats.take(3)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 194,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "[]\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "print(skus_stats.rdd.filter(lambda x: x.sku == 'FI911SHF89UBM-50').take(3))"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {
187 | "collapsed": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "# query_statistics = \"\"\"\n",
192 | "# SELECT\n",
193 | "# sku,\n",
194 | "# {gamma} / SQRT(SUM(score * score)) p,\n",
195 | "# IF({gamma} > SQRT(SUM(score * score)), SQRT(SUM(score * score)), {gamma}) q\n",
196 | "# FROM test1\n",
197 | "# GROUP BY 1\n",
198 | "# \"\"\""
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 60,
204 | "metadata": {
205 | "collapsed": true
206 | },
207 | "outputs": [],
208 | "source": [
209 | "# def get_gamma(threshold, numCols):\n",
210 | "# return math.sqrt(10 * math.log(numCols) / threshold) if threshold > 10e-6 else math.inf"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 76,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "35.57234899487128\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "# gamma_b = sc.broadcast(get_gamma(10e-2))\n",
228 | "# print(gamma_b.value)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 77,
234 | "metadata": {
235 | "collapsed": true
236 | },
237 | "outputs": [],
238 | "source": [
239 | "# skus_stats = spark.sql(query_statistics.format(gamma=gamma_b.value))"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 78,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "[Row(sku='NI531SRM74IHX', p=2.8758539658272255, q=12.36931687685298),\n",
251 | " Row(sku='MO578SHF45QNE', p=0.5225157525775272, q=35.57234899487128)]"
252 | ]
253 | },
254 | "execution_count": 78,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "# skus_stats.head(2)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 10,
266 | "metadata": {
267 | "collapsed": true
268 | },
269 | "outputs": [],
270 | "source": [
271 | "pq_b = sc.broadcast({row.sku: [row.p, row.q] for row in skus_stats.collect()})"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 11,
277 | "metadata": {},
278 | "outputs": [
279 | {
280 | "data": {
281 | "text/plain": [
282 | "[7.132311576894841, 5.0]"
283 | ]
284 | },
285 | "execution_count": 11,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "pq_b.value['FI911SHF89UBM-50']"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 157,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": [
302 | "#skus_idx_b = sc.broadcast({sku: idx for idx, sku in enumerate(pq_b.value.keys())})"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 158,
308 | "metadata": {
309 | "collapsed": true
310 | },
311 | "outputs": [],
312 | "source": [
313 | "#idx_skus_b = sc.broadcast({value: key for key, value in skus_idx_b.value.items()})"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 53,
319 | "metadata": {
320 | "collapsed": true
321 | },
322 | "outputs": [],
323 | "source": [
324 | "# d = {row.sku: row.idx for row in skus_rdd.collect()}\n",
325 | "# db = sc.broadcast(d)\n",
326 | "\n",
327 | "# id_ = {value: key for key, value in d.items()}\n",
328 | "# id_b = sc.broadcast(id_)"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 159,
334 | "metadata": {
335 | "collapsed": true
336 | },
337 | "outputs": [],
338 | "source": [
339 | "#numCols = sc.broadcast(len(idx_skus_b.value))"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 57,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "# p = [0] * numCols.value\n",
351 | "# for row in skus_stats"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 55,
357 | "metadata": {
358 | "collapsed": true
359 | },
360 | "outputs": [],
361 | "source": [
362 | "#p = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()} # if 0 happens as the ``norm`` we expected an Exception to be raised.\n",
363 | "#p_b = sc.broadcast(p)"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 34,
369 | "metadata": {
370 | "collapsed": true
371 | },
372 | "outputs": [],
373 | "source": [
374 | "#q = {row.sku: gamma_b.value / row.norm for row in skus_stats.collect()}"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 35,
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "data": {
384 | "text/plain": [
385 | "312988"
386 | ]
387 | },
388 | "execution_count": 35,
389 | "metadata": {},
390 | "output_type": "execute_result"
391 | }
392 | ],
393 | "source": [
394 | "#numCols.value"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 31,
400 | "metadata": {},
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/plain": [
405 | "12.36931687685298"
406 | ]
407 | },
408 | "execution_count": 31,
409 | "metadata": {},
410 | "output_type": "execute_result"
411 | }
412 | ],
413 | "source": [
414 | "#skus_s['NI531SRM74IHX']"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 12,
420 | "metadata": {
421 | "collapsed": true
422 | },
423 | "outputs": [],
424 | "source": [
425 | "query_users_items = \"\"\"\n",
426 | "SELECT\n",
427 | "data\n",
428 | "FROM(\n",
429 | " SELECT\n",
430 | " fv,\n",
431 | " COLLECT_LIST(STRUCT(sku, score)) data\n",
432 | " FROM test1\n",
433 | " GROUP BY 1\n",
434 | ")\n",
435 | "WHERE SIZE(data) BETWEEN 2 AND 200\n",
436 | "\"\"\"\n",
437 | "\n",
438 | "t0 = time.time()\n",
439 | "users = spark.sql(query_users_items)\n",
440 | "users_rdd = users.rdd"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 148,
446 | "metadata": {},
447 | "outputs": [
448 | {
449 | "data": {
450 | "text/plain": [
451 | "[Row(data=[Row(sku='CO796SCF87LXG', score=0.5), Row(sku='CO796SCM72JGT', score=0.5), Row(sku='CO796SCM23HHW', score=0.5)]),\n",
452 | " Row(data=[Row(sku='HA723APF18CPL', score=0.5), Row(sku='CO515APF44YPR', score=0.5), Row(sku='LA906APF69OQC', score=0.5), Row(sku='TU142APF19BPC', score=0.5), Row(sku='CO515APF27DIA', score=0.5), Row(sku='GA753APF40NJR', score=0.5), Row(sku='GA753APF41NJQ', score=1.0)])]"
453 | ]
454 | },
455 | "execution_count": 148,
456 | "metadata": {},
457 | "output_type": "execute_result"
458 | }
459 | ],
460 | "source": [
461 | "users.head(2)"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": 13,
467 | "metadata": {
468 | "collapsed": true
469 | },
470 | "outputs": [],
471 | "source": [
472 | "def map_cosines(row):\n",
473 | " for i in range(len(row)):\n",
474 | " value_i = row[i].score / pq_b.value[row[i].sku][1]\n",
475 | " if random.random() < pq_b.value[row[i].sku][0]:\n",
476 | " for j in range(i + 1, len(row)):\n",
477 | " value_j = row[j].score / pq_b.value[row[j].sku][1]\n",
478 |     "            if random.random() < pq_b.value[row[j].sku][0]:\n",
479 | " yield ((row[i].sku, row[j].sku), value_i * value_j)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 14,
485 | "metadata": {
486 | "collapsed": true
487 | },
488 | "outputs": [],
489 | "source": [
490 | "users2 = users.rdd.flatMap(lambda row: map_cosines(row.data))"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 150,
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "data": {
500 | "text/plain": [
501 | "[(('CO796SCM72JGT', 'CO796SCM23HHW'), 0.0002015811797719921),\n",
502 | " (('HA723APF18CPL', 'CO515APF44YPR'), 0.031234752377721216)]"
503 | ]
504 | },
505 | "execution_count": 150,
506 | "metadata": {},
507 | "output_type": "execute_result"
508 | }
509 | ],
510 | "source": [
511 | "users2.take(2)"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 15,
517 | "metadata": {
518 | "collapsed": true
519 | },
520 | "outputs": [],
521 | "source": [
522 | "final = users2.reduceByKey(operator.add)"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 16,
528 | "metadata": {},
529 | "outputs": [
530 | {
531 | "name": "stdout",
532 | "output_type": "stream",
533 | "text": [
534 | "[(('VI618SHF35NCY-51', 'LU773ACF56ILV'), 0.029501220638256383), (('FI911APF72ZHF', 'KA952APF52DNB'), 0.015504341823651058), (('FA865ACF45CCS', 'QU097ACF14BCMN'), 0.7071067811865475)]\n",
535 | "363.733115196228\n"
536 | ]
537 | }
538 | ],
539 | "source": [
540 | "t0 = time.time()\n",
541 | "print(final.take(3))\n",
542 | "print(time.time() - t0)"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": null,
548 | "metadata": {
549 | "collapsed": true
550 | },
551 | "outputs": [],
552 | "source": []
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 18,
557 | "metadata": {
558 | "collapsed": true
559 | },
560 | "outputs": [],
561 | "source": [
562 | "import numpy as np"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": 20,
568 | "metadata": {
569 | "collapsed": true
570 | },
571 | "outputs": [],
572 | "source": [
573 | "a = np.random.randn(12288, 150) # a.shape = (12288, 150)\n",
574 | "b = np.random.randn(150, 45) # b.shape = (150, 45)\n",
575 | "c = np.dot(a,b)"
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": 21,
581 | "metadata": {},
582 | "outputs": [
583 | {
584 | "data": {
585 | "text/plain": [
586 | "(12288, 45)"
587 | ]
588 | },
589 | "execution_count": 21,
590 | "metadata": {},
591 | "output_type": "execute_result"
592 | }
593 | ],
594 | "source": [
595 | "c.shape"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 39,
601 | "metadata": {
602 | "collapsed": true
603 | },
604 | "outputs": [],
605 | "source": [
606 | "b = np.random.randn(4, 1)"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 40,
612 | "metadata": {},
613 | "outputs": [
614 | {
615 | "data": {
616 | "text/plain": [
617 | "array([[ 0.22988676],\n",
618 | " [-0.77589895],\n",
619 | " [-0.77754825],\n",
620 | " [-0.06151452]])"
621 | ]
622 | },
623 | "execution_count": 40,
624 | "metadata": {},
625 | "output_type": "execute_result"
626 | }
627 | ],
628 | "source": [
629 | "b"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 41,
635 | "metadata": {},
636 | "outputs": [
637 | {
638 | "data": {
639 | "text/plain": [
640 | "array([-0.06151452])"
641 | ]
642 | },
643 | "execution_count": 41,
644 | "metadata": {},
645 | "output_type": "execute_result"
646 | }
647 | ],
648 | "source": [
649 | "b[3]"
650 | ]
651 | },
652 | {
653 | "cell_type": "code",
654 | "execution_count": 42,
655 | "metadata": {
656 | "collapsed": true
657 | },
658 | "outputs": [],
659 | "source": [
660 | "a = np.random.randn(3, 3)\n",
661 | "b = np.random.randn(3, 1)\n",
662 | "c = a*b"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 44,
668 | "metadata": {},
669 | "outputs": [
670 | {
671 | "data": {
672 | "text/plain": [
673 | "array([[-0.01004274, -0.45400667, -1.97007744],\n",
674 | " [-0.54591752, -0.59968557, 1.47375852],\n",
675 | " [ 0.33738485, 1.00607007, 0.69213239]])"
676 | ]
677 | },
678 | "execution_count": 44,
679 | "metadata": {},
680 | "output_type": "execute_result"
681 | }
682 | ],
683 | "source": [
684 | "a"
685 | ]
686 | },
687 | {
688 | "cell_type": "code",
689 | "execution_count": 45,
690 | "metadata": {},
691 | "outputs": [
692 | {
693 | "data": {
694 | "text/plain": [
695 | "array([[ 0.42442128],\n",
696 | " [-0.8827092 ],\n",
697 | " [-0.5387125 ]])"
698 | ]
699 | },
700 | "execution_count": 45,
701 | "metadata": {},
702 | "output_type": "execute_result"
703 | }
704 | ],
705 | "source": [
706 | "b"
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": 46,
712 | "metadata": {},
713 | "outputs": [
714 | {
715 | "data": {
716 | "text/plain": [
717 | "array([[-0.00426235, -0.19269009, -0.83614278],\n",
718 | " [ 0.48188642, 0.52934797, -1.30090021],\n",
719 | " [-0.18175343, -0.54198252, -0.37286037]])"
720 | ]
721 | },
722 | "execution_count": 46,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "c"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": null,
734 | "metadata": {
735 | "collapsed": true
736 | },
737 | "outputs": [],
738 | "source": []
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": null,
743 | "metadata": {
744 | "collapsed": true
745 | },
746 | "outputs": [],
747 | "source": []
748 | },
749 | {
750 | "cell_type": "code",
751 | "execution_count": null,
752 | "metadata": {
753 | "collapsed": true
754 | },
755 | "outputs": [],
756 | "source": []
757 | },
758 | {
759 | "cell_type": "code",
760 | "execution_count": null,
761 | "metadata": {
762 | "collapsed": true
763 | },
764 | "outputs": [],
765 | "source": []
766 | }
767 | ],
768 | "metadata": {
769 | "kernelspec": {
770 | "display_name": "PySpark",
771 | "language": "python",
772 | "name": "pyspark"
773 | },
774 | "language_info": {
775 | "codemirror_mode": {
776 | "name": "ipython",
777 | "version": 3
778 | },
779 | "file_extension": ".py",
780 | "mimetype": "text/x-python",
781 | "name": "python",
782 | "nbconvert_exporter": "python",
783 | "pygments_lexer": "ipython3",
784 | "version": "3.5.2"
785 | }
786 | },
787 | "nbformat": 4,
788 | "nbformat_minor": 2
789 | }
790 |
--------------------------------------------------------------------------------
/spark_jobs/neighbor.py:
--------------------------------------------------------------------------------
1 | #MIT License
2 | #
3 | #Copyright (c) 2017 Willian Fuks
4 | #
5 | #Permission is hereby granted, free of charge, to any person obtaining a copy
6 | #of this software and associated documentation files (the "Software"), to deal
7 | #in the Software without restriction, including without limitation the rights
8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | #copies of the Software, and to permit persons to whom the Software is
10 | #furnished to do so, subject to the following conditions:
11 | #
12 | #The above copyright notice and this permission notice shall be included in all
13 | #copies or substantial portions of the Software.
14 | #
15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | #SOFTWARE.
22 |
23 | """
24 | Set of tools to run Marreco's neighborhood algorithm in spark.
25 | """
26 |
27 | import os
28 | import sys
29 | import json
30 | import operator
31 | import math
32 | import random
33 | import time
34 | import argparse
35 | import datetime
36 | from collections import defaultdict
37 |
38 | sys.path.append('..')
39 |
40 | from base import MarrecoBase
41 | from py4j.protocol import Py4JJavaError
42 | from pyspark.sql import SparkSession
43 | from pyspark.sql import types as stypes
44 | from pyspark.sql.utils import AnalysisException
45 |
46 |
47 | class MarrecoNeighborJob(MarrecoBase):
48 | """This Class has all methods necessary to build Marreco Neighborhood
49 | against Spark.
50 |
51 | :type context: `pyspark.SparkContext`
52 | :param context: context to run Spark jobs.
53 | """
54 | def transform_data(self, sc, args):
55 | """This method gets datajet files as input and prepare them on a daily
56 | intermediary basis for Marreco's main algorithm DIMSUM.
57 |
58 | :type sc: spark context
59 | :param sc: spark context for running jobs.
60 |
61 | :param args:
62 |
63 |             :type days_init: int
64 |             :param days_init: How many days back to scan through the files
65 |                               used in the transformation phase. If this value
66 |                               is, say, ``5``, then Marreco takes today's date,
67 |                               goes back 5 days in time and starts reading
68 |                               input files from there.
69 |
70 | :type days_end: int
71 |             :param days_end: Similar to ``days_init`` but tells where the
72 |                              scanning should end. If set to, say, ``3``, then
73 |                              scanning goes back in time until 3 days ago,
74 |                              counting from today.
75 |
76 | :type w_browse: float
77 | :param w_browse: Weight associated to browsing events on skus.
78 |
79 | :type w_purchase: float
80 | :param w_purchase: Weight associated to purchasing events on skus.
81 |
82 | :type force: str
83 | :param force: Either ``yes``, in which case forces recreation of
84 | files, or ``no``, in which case if files already
85 | exist then do nothing.
86 |
87 | :type source_uri: str
88 | :param source_uri: URI from where to read input data from.
89 |
90 | :type inter_uri: str
91 | :param inter_uri: URI to save intermediate results.
92 |
93 | :type neighbor_uri: str
94 | :param neighbor_uri: URI for where to save similarity matrix result.
95 |
96 |             :type threshold: float
97 |             :param threshold: Sets how much quality we are willing to
98 |                               sacrifice in order to gain performance; higher
99 |                               values mean more aggressive sampling.
100 |
101 |             :type decay: float
102 |             :param decay: How much less influence a score has the longer ago
103 |                           it happened. The further back in time an event is,
104 |                           the more this ``decay`` factor diminishes its value.
105 | """
106 | spark = SparkSession(sc)
107 | for day in range(args.days_init, args.days_end - 1, -1):
108 | formatted_day = self.get_formatted_date(day)
109 | source_uri = args.source_uri.format(formatted_day)
110 | inter_uri = args.inter_uri.format(formatted_day)
111 | try:
112 | inter_data = spark.read.json(inter_uri,
113 | schema = self._load_users_matrix_schema()).first()
114 |
115 | if args.force == 'yes' or not inter_data:
116 | self._process_datajet_day(sc,
117 | source_uri,
118 | inter_uri,
119 | args,
120 | mode='overwrite')
121 | except (Py4JJavaError, AnalysisException):
122 | self._process_datajet_day(sc, source_uri, inter_uri, args)
123 |
124 |
125 | def _process_datajet_day(self,
126 | sc,
127 | uri,
128 | inter_uri,
129 | args,
130 | mode=None,
131 | compression='gzip'):
132 |         """Gets datajet json-like files and transforms them into data like
133 |         [user_id, [(sku, score), ...]], saving the result in the end.
134 |
135 | :type sc: spark context
136 | :param sc: context to run spark jobs.
137 |
138 | :type uri: str
139 | :param uri: where the files are located.
140 |
141 | :type inter_uri: str
142 | :param inter_uri: where intermediate results should be saved.
143 |
144 | :type args: namedtuple
145 | :type args.w_browse: float
146 | :param args.w_browse: weight associated to users browsing history.
147 |
148 | :type args.w_purchase: float
149 | :param args.w_purchase: weight associated to purchases.
150 |
151 | :type args.decay: float
152 | :param args.decay: decay factor for account events that happened
153 | long ago.
154 |
155 | :type mode: str
156 | :param mode: indicates how data should be saved. If ``None`` then
157 |                      throws an error if the file already exists. If ``overwrite`` then
158 | deletes previous file and saves new one.
159 | """
160 | sc.textFile(uri) \
161 | .flatMap(lambda x: self._process_json(x, args)) \
162 | .filter(lambda x: x) \
163 | .groupByKey() \
164 | .mapValues(list) \
165 | .flatMap(lambda x: self._aggregate_skus(x)) \
166 | .toDF(schema=self._load_users_matrix_schema()) \
167 | .write.json(inter_uri, compression=compression, mode=mode)
168 |
169 |
170 | def _load_users_matrix_schema(self):
171 | """Loads schema with data type [user, [(sku, score), (sku, score)]]
172 |
173 | :rtype: `pyspark.sql.type.StructType`
174 |         :returns: schema specification for user -> (sku, score) data.
175 | """
176 | return stypes.StructType(fields=[
177 | stypes.StructField("user_id", stypes.StringType()),
178 | stypes.StructField('interacted_items', stypes.ArrayType(
179 | stypes.StructType(fields=[stypes.StructField('key',
180 | stypes.StringType()), stypes.StructField('score',
181 | stypes.FloatType())])))])
182 |
183 |
184 | def build_marreco(self, sc, args):
185 | """Main method for building Marreco's algorithms and saving results
186 | for later usage.
187 |
188 | :type sc: `pyspark.SparkContext`
189 | :param sc: spark context for running jobs.
190 |
191 | :param args:
192 |             :type days_init: int
193 |             :param days_init: from how many days ago to start reading the
194 |                               intermediary daily results.
195 |
196 |             :type days_end: int
197 |             :param days_end: until how many days ago to keep reading input data.
198 |
199 |             :type inter_uri: str
200 |             :param inter_uri: URI from where intermediary results should be
201 |                               read.
202 |
203 |             :type neighbor_uri: str
204 |             :param neighbor_uri: where to save final marreco matrix (similarity
205 |                                  and user_sku_score matrix).
206 |
207 |             :type users_matrix_uri: str
208 |             :param users_matrix_uri: URI for where to save matrix of users
209 |                                      and their interacted skus. If ``None``,
210 |                                      the users matrix is not saved.
211 |
212 |             :type threshold: float
213 |             :param threshold: sets how much quality we are willing to
214 |                               sacrifice in exchange for processing time;
215 |                               higher values mean more aggressive sampling.
216 |
217 | """
218 | spark = SparkSession(sc)
219 | data = sc.emptyRDD()
220 | for day in range(args.days_init, args.days_end - 1, -1):
221 | formatted_day = self.get_formatted_date(day)
222 | inter_uri = self._render_inter_uri(
223 | args.inter_uri.format(formatted_day))
224 |
225 | data = data.union(spark.read.json(inter_uri,
226 | schema=self._load_users_matrix_schema()).rdd)
227 |
228 | data = data.reduceByKey(operator.add) \
229 | .flatMap(lambda x: self._aggregate_skus(x)) \
230 | .filter(lambda x: len(x[1]) > 1 and len(x[1]) <= 20)
231 |
232 | if args.users_matrix_uri:
233 | self._save_users_matrix(args.users_matrix_uri, data)
234 |
235 | pq_b = self._broadcast_pq(sc, data, args.threshold)
236 | data = data.flatMap(lambda x: self._run_DIMSUM(x[1], pq_b)) \
237 | .reduceByKey(operator.add)
238 |
239 | self._save_neighbor_matrix(args.neighbor_uri, data)
240 |
241 |
242 | def _save_neighbor_matrix(self, neighbor_uri, data):
243 | """Turns similarities into the final neighborhood matrix. The schema
244 | for saving the matrix is like {sku0: [(sku1, similarity1)...]}
245 |
246 | :type neighbor_uri: str
247 | :param neighbor_uri: uri for where to save the matrix.
248 |
249 | :type data: RDD
250 | :param data: RDD with data like [sku0, sku1, similarity]
251 | """
252 | def duplicate_keys(row):
253 |             """Emits each similarity under both orderings of its sku pair.
254 |             The DIMSUM algorithm only computes the upper triangle of the
255 |             similarity matrix; here we add its transpose as well so Marreco
256 |             can see all similarities between all skus.
257 |
258 | :type row: list
259 | :param row: data of type [(sku0, sku1), similarity]
260 |
261 | :rtype: list:
262 | :returns: skus and their transposed similarities, such as
263 | [sku0, [sku1, s]], [sku1, [sku0, s]]
264 | """
265 | yield (row[0][0], [(row[0][1], row[1])])
266 | yield (row[0][1], [(row[0][0], row[1])])
267 |
268 | data.flatMap(lambda x: duplicate_keys(x)) \
269 | .reduceByKey(operator.add) \
270 | .toDF(schema=self._load_neighbor_schema()) \
271 | .write.json(neighbor_uri, compression='gzip', mode='overwrite')
272 |
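    # Editor's note -- an illustrative example, not part of the original code.
    # ``duplicate_keys`` turns one upper-triangle entry into two rows, e.g.
    # (('sku0', 'sku1'), 0.3) -> ('sku0', [('sku1', 0.3)]) and
    # ('sku1', [('sku0', 0.3)]); the following reduceByKey then concatenates
    # the per-sku lists before they are written out.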
273 |
274 | def _load_neighbor_schema(self):
275 | """Loads neighborhood schema for similarity matrix
276 |
277 | :rtype: `pyspark.sql.types.StructField`
278 | :returns: schema of type ["key", [("key", "value")]]
279 | """
280 | return stypes.StructType(fields=[
281 | stypes.StructField("item_key", stypes.StringType()),
282 | stypes.StructField("similarity_items", stypes.ArrayType(
283 | stypes.StructType(fields=[
284 | stypes.StructField("key", stypes.StringType()),
285 | stypes.StructField("score", stypes.FloatType())])))])
286 |
287 |
288 | def _save_users_matrix(self, user_matrix_uri, data):
289 | """Saves user -> sku matrix so Marreco can use it later for greater
290 | optimization. In this case, the matrix is saved as:
291 | [user_id, [{"key": sku, "score": score}] interacted_items]
292 |
293 |         :type user_matrix_uri: str
294 |         :param user_matrix_uri: URI defining where the matrix of users and
295 |                                 their interacted skus should be saved, as
296 |                                 gzip compressed json (overwriting any
297 |                                 previous result).
298 |
299 | :type data: RDD
300 | :param data: RDD with values [user, [(sku, score), (sku, score)]]
301 | """
302 | def transform_users_data(row):
303 | """Transform row from [user, [(sku, score)]] to desired output.
304 |
305 | :type data: RDD
306 | :param data: observed users interaction
307 | """
308 | yield [{"user_id": row[0],
309 | "interacted_items": list(map(
310 | lambda x: {"key": x[0], "score": x[1]}, row[1]))}]
311 | data.toDF(schema=self._load_users_matrix_schema()) \
312 | .write.json(user_matrix_uri, compression='gzip', mode='overwrite')
313 |
314 |
315 | def _run_DIMSUM(self, row, pq_b):
316 |         """Implements DIMSUM as described here:
317 |
318 | http://arxiv.org/abs/1304.1467
319 |
320 | :type row: list
321 | :param row: list with values (user, [(sku, score)...])
322 |
323 | :rtype: list
324 |         :returns: `yield` on ((sku0, sku1), partial similarity) for sampled pairs
325 | """
326 | for i in range(len(row)):
327 | if random.random() < pq_b.value[row[i][0]][0]:
328 | for j in range(i + 1, len(row)):
329 | if random.random() < pq_b.value[row[j][0]][0]:
330 | value_i = row[i][1] / pq_b.value[row[i][0]][1]
331 | value_j = row[j][1] / pq_b.value[row[j][0]][1]
332 | key = ((row[i][0], row[j][0]) if row[i][0] < row[j][0]
333 | else (row[j][0], row[i][0]))
334 | yield (key, value_i * value_j)
335 |
336 |
337 | def _broadcast_pq(self, sc, data, threshold):
338 | """Builds and broadcast probability ``p`` value and factor ``q`` for
339 | each sku.
340 |
341 | :type data: `spark.RDD`
342 |         :param data: RDD with values (user, [(sku, score), ...]).
343 |
344 | :type threshold: float
345 | :param threshold: all similarities above this value will be guaranteed
346 | to converge to real value with relative error ``e``.
347 |
348 | :rtype: broadcasted dict
349 | :returns: dict sku -> (p, q) where p is defined as ``gamma / ||c||``
350 | and ``q = min(gamma, ||c||)``.
351 | """
352 | norms = {sku: score for sku, score in
353 | data.flatMap(lambda x: self._process_scores(x)) \
354 | .reduceByKey(operator.add) \
355 | .map(lambda x: (x[0], math.sqrt(x[1]))) \
356 | .collect()}
357 |
358 | gamma = (math.sqrt(10 * math.log(len(norms)) / threshold) if threshold
359 | > 1e-6 else math.inf)
360 |
361 | pq_b = sc.broadcast({sku: (gamma / value, min(gamma, value))
362 | for sku, value in norms.items()})
363 | return pq_b
364 |
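    # Editor's note -- illustrative numbers only, not part of the original code.
    # With ``threshold = 0.1`` and, say, 1000 distinct skus, the oversampling
    # parameter is gamma = sqrt(10 * ln(1000) / 0.1) ~= 26.3. A sku whose column
    # norm is 5 then gets p = 26.3 / 5 ~= 5.3 (>= 1, so it is always sampled)
    # and q = min(26.3, 5) = 5, i.e. its scores are divided by the exact norm.
    # A very popular sku with norm 100 gets p ~= 0.26 (sampled ~26% of the time)
    # and q = 26.3. When ``threshold`` <= 1e-6, gamma is infinite, every pair is
    # kept and the reduceByKey sum in ``build_marreco`` yields the exact cosine
    # similarity.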
365 |
366 | def _process_scores(self, row):
367 | """After all user -> score aggregation is done, this method loops
368 | through each sku for a given user and yields its squared score so
369 | that we can compute the norm ``||c||`` for each sku column.
370 |
371 |         :type row: list
372 |         :param row: row of type (user, [(sku, score), ...])
373 |
374 |         :rtype: tuple
375 |         :returns: `yield` on tuples of type (sku, score ** 2)
376 | """
377 | for inner_row in row[1]:
378 | yield (inner_row[0], inner_row[1] ** 2)
379 |
380 |
381 | def _render_inter_uri(self, inter_uri, name_pattern='part-*'):
382 | """Helper function to process inter_uri's for later usage.
383 |
384 | :type inter_uri: str
385 | :param inter_uri: URI used for saving intermediate data transformation
386 | results.
387 |
388 | :type name_pattern: str
389 | :param name_pattern: pattern used by spark to save multiple files.
390 |
391 | :rtype: str
392 | :returns: URI rendered template for retrieving data back to code.
393 | """
394 | return os.path.join(inter_uri, name_pattern)
395 |
396 |
397 | @staticmethod
398 | def _process_json(row, args):
399 | """Mapper function to extract from each line from datajet file
400 | and return interactions between customers and skus.
401 |
402 | :type row: str
403 | :param row: json string with datajet data.
404 |
405 | :type args: namedtuple
406 |         :param args: contains values to specify how the json transformation
407 | should happen.
408 |
409 | :type w_browse: float
410 | :param w_browse: weight associated to the browsing patterns of
411 | customers.
412 |
413 | :type w_purchase: float
414 | :param w_purchase: weight associated to purchasing patterns of
415 | customers.
416 |
417 | :type decay: float
418 | :param decay: determines how much past interactions should be less
419 | meaningful as time passes by.
420 |
421 | :rtype: list
422 | :returns: `yield` on [customerID, (sku, score)]
423 | """
424 | try:
425 | r = json.loads(row)
426 | if (r['event']['source']['tracker'] == 'fish' and
427 | 'local_timestamp' in r['event'] and
428 | r['event']['identifiers']['djUCID']['value'] and
429 | r['event']['type'] in {"productview", "orderconfirmation"}):
430 |
431 |                 decay_factor = math.exp(-args.decay * (datetime.datetime.utcnow() -
432 | datetime.datetime.utcfromtimestamp(
433 | int(r['event']['local_timestamp']) / 1000.0)).days)
434 |
435 | type_ = r['event']['type']
436 | score = (args.w_browse if type_ == 'productview'
437 | else args.w_purchase) * decay_factor
438 |
439 | if type_ == 'productview':
440 | yield [r['event']['identifiers']['djUCID']['value'],
441 | (r['event']['details']['product']['group_id'], score)]
442 | elif type_ == 'orderconfirmation':
443 | for product in r['event']['details']['products']:
444 | yield [r['event']['identifiers']['djUCID']['value'],
445 | (product['group_id'], score)]
446 |         except Exception:
447 | yield []
448 |
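    # Editor's note -- illustrative numbers only, not part of the original code.
    # The decay weighting above is ``score * exp(-decay * age_in_days)``: with
    # ``decay = 0.1`` an event from 7 days ago keeps exp(-0.7) ~= 0.5 of its
    # weight, while ``decay = 0`` leaves every event at full weight regardless
    # of age.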
449 |
450 | @staticmethod
451 | def _aggregate_skus(row):
452 | """Aggregates skus from customers and their respective scores
453 |
454 | :type row: list
455 | :param row: list having values [user, (sku, score)]
456 |
457 | :rtype: list
458 |         :returns: `yield` on (user, [(sku, sum(score)), ...])
459 | """
460 | d = defaultdict(float)
461 | for inner_row in row[1]:
462 | d[inner_row[0]] += inner_row[1]
463 | yield (row[0], list(d.items()))
464 |
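    # Editor's note -- an illustrative example, not part of the original code.
    # ('user0', [('sku0', 0.5), ('sku0', 1.0), ('sku1', 6.0)]) aggregates to
    # ('user0', [('sku0', 1.5), ('sku1', 6.0)]) (sku order in the output list
    # is not guaranteed).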
465 |
466 | def process_sysargs(self, args):
467 | parser = argparse.ArgumentParser()
468 |
469 | parser.add_argument('--days_init',
470 | dest='days_init',
471 | type=int,
472 | help=("Total amount of days to come back in time "
473 | "from today's date."))
474 |
475 | parser.add_argument('--days_end',
476 | dest='days_end',
477 | type=int,
478 |                             help=("How many days ago, counting back from "
479 |                                   "today's date, to stop scanning files."))
480 |
481 | parser.add_argument('--source_uri',
482 | dest='source_uri',
483 | type=str,
484 | help=("URI template from where to read source "
485 | "files from."))
486 |
487 | parser.add_argument('--inter_uri',
488 | dest='inter_uri',
489 | type=str,
490 | help=('URI for saving intermediary results.'))
491 |
492 | parser.add_argument('--threshold',
493 | dest='threshold',
494 | type=float,
495 | help=('Threshold for acceptable similarity relative'
496 | ' error.'))
497 |
498 | parser.add_argument('--force',
499 | dest='force',
500 | type=str,
501 | help=('If ``yes`` then replace all files with new ones. '
502 | 'If ``no``, then no replacing happens.'))
503 |
504 | parser.add_argument('--users_matrix_uri',
505 | dest='users_matrix_uri',
506 | type=str,
507 | default=None,
508 | help=('where to save matrix of users. If ``None`` '
509 | 'then the matrix is not built.'))
510 |
511 | parser.add_argument('--neighbor_uri',
512 | dest='neighbor_uri',
513 | type=str,
514 | help=('where to save matrix of skus similarities'))
515 |
516 | parser.add_argument('--w_browse',
517 | dest='w_browse',
518 | type=float,
519 | help=('weight associated to browsing action score'))
520 |
521 | parser.add_argument('--w_purchase',
522 | dest='w_purchase',
523 | type=float,
524 | help=('weight associated to purchasing action score'))
525 |
526 | parser.add_argument('--decay',
527 | dest='decay',
528 | type=float,
529 | help=('Decaying factor to account for past interactions'))
530 |
531 | args = parser.parse_args(args)
532 | return args
533 |
534 |
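# Editor's note -- a hypothetical usage sketch, not part of the original file.
# The real entry point in this repo is run_marreco.py / factory.py; the URIs
# below are made-up placeholders and only the flags themselves come from
# ``process_sysargs``. Assuming a live SparkContext ``sc``:
#
#   job = MarrecoNeighborJob()
#   args = job.process_sysargs([
#       '--days_init=7', '--days_end=1', '--force=no',
#       '--source_uri=gs://bucket/datajet/{}/*.gz',
#       '--inter_uri=gs://bucket/marreco/inter/{}',
#       '--neighbor_uri=gs://bucket/marreco/neighbor',
#       '--users_matrix_uri=gs://bucket/marreco/users',
#       '--w_browse=0.5', '--w_purchase=6.0',
#       '--threshold=0.1', '--decay=0.03'])
#   job.transform_data(sc, args)   # daily [user, [(sku, score)]] intermediates
#   job.build_marreco(sc, args)    # DIMSUM similarities + users matrix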
--------------------------------------------------------------------------------