├── notebooks ├── .gitkeep ├── network_analysis │ ├── README.md │ └── extract_brexit_taxon_data.ipynb ├── eda │ ├── notebook_functions.py │ └── look_at_sampling_data.ipynb └── taxon │ ├── taxon_translate.ipynb │ └── taxon_eda.ipynb ├── src ├── __init__.py ├── data │ ├── .gitkeep │ ├── __init__.py │ ├── make_dataset.py │ ├── archived_multiprocess │ │ ├── __init__.py │ │ ├── test_make_dataset.py │ │ └── multiprocess_utils.py │ ├── tests │ │ ├── test.sql │ │ ├── query.sql │ │ ├── test_bq_extract_data.py │ │ ├── test_preprocess.py │ │ └── check_dataset.ipynb │ ├── queries │ │ ├── query_to_fail.sql │ │ ├── simple_test.sql │ │ ├── prelim_meta_standard_query.sql │ │ ├── stnd_taxon_ab.sql │ │ ├── standard_query.sql │ │ ├── prelim_meta_standard_query_with_pageseq.sql │ │ ├── stnd_taxon_no_len_1_devcounts.sql │ │ ├── stnd_taxon.sql │ │ └── stnd_taxon_no_len_1.sql │ ├── preprocess_dataset.py │ ├── preprocess_dataset_thinner.py │ ├── taxon_translate.py │ ├── bq_extract_data.py │ ├── preprocess.py │ ├── make_network_data.py │ └── merge_dataset.py ├── features │ ├── .gitkeep │ ├── __init__.py │ ├── tests │ │ └── test_build_features.py │ └── build_features.py ├── models │ ├── .gitkeep │ ├── __init__.py │ ├── predict_model.py │ └── train_model.py ├── visualization │ ├── .gitkeep │ ├── __init__.py │ └── visualize.py ├── logging.conf └── analysis │ └── journey_events_analysis.py ├── data ├── raw_bq_extract │ └── .gitkeep ├── processed_journey │ └── .gitkeep └── processed_network │ └── .gitkeep ├── reports └── figures │ └── .gitkeep ├── network_data_pipeline.png ├── .envrc ├── .github └── workflows │ └── ci.yml ├── LICENSE ├── .gitignore ├── requirements.txt └── CONTRIBUTING.md /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/features/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/raw_bq_extract/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/make_dataset.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/features/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/models/predict_model.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/train_model.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/visualization/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed_journey/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed_network/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/figures/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/visualization/visualize.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/archived_multiprocess/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/tests/test.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM tables 3 | WHERE thing < 5 4 | -------------------------------------------------------------------------------- /network_data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alphagov/govuk-network-data/HEAD/network_data_pipeline.png -------------------------------------------------------------------------------- /src/data/tests/query.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 3 | TIME_STAMP)) 4 | WHERE PageSeq_Length > 1 5 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | export GDRIVE_DATADIR="/Volumes/GoogleDrive/Team Drives/GOV.UK teams/2018-2019/Q3/Knowledge up Q3/Data science/data/" 2 | export DATA_DIR="$PWD/data" 3 | export REPORTS_DIR="$PWD/reports" 4 | export LOGGING_CONFIG="$PWD/src/logging.conf" 5 | export BQ_KEY_DIR="$PWD/key" 6 | export QUERIES_DIR="$PWD/src/data/queries/" 7 | export DOCUMENTS="$HOME/Documents" 8 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: actions/setup-python@v2 9 | with: 10 | python-version: 
'3.6' 11 | - run: sudo apt-get install python-dev libxml2-dev libxslt-dev libz-dev 12 | - run: python -m pip install --upgrade pip 13 | - run: pip install -r requirements.txt 14 | - run: cd ./src/data/ && python -m pytest tests/ 15 | - run: cd ./src/features/ && python -m pytest tests/ 16 | -------------------------------------------------------------------------------- /src/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=consoleHandler,fileHandler 6 | 7 | [formatters] 8 | keys=pipelineFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler,fileHandler 13 | qualname=pipeline 14 | propagate=0 15 | 16 | [handler_consoleHandler] 17 | class=StreamHandler 18 | level=DEBUG 19 | formatter=pipelineFormatter 20 | args=(sys.stdout, ) 21 | 22 | [handler_fileHandler] 23 | class=FileHandler 24 | level=DEBUG 25 | formatter=pipelineFormatter 26 | args=('/tmp/govuk-network-data.log', ) 27 | 28 | [formatter_pipelineFormatter] 29 | format=%(asctime)s - %(name)s - %(levelname)s - %(funcName)s %(message)s 30 | datefmt= 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Government Digital Service 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/data/archived_multiprocess/test_make_dataset.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/data/ 2 | # python3 -m pytest tests/ 3 | import make_dataset 4 | import pandas as pd 5 | import pandas as pd 6 | 7 | def test_list_to_dict(): 8 | assert make_dataset.list_to_dict(['Desktop', 'Tablet', 'Mobile', 'Desktop', 'Mobile', 'Desktop']) ==\ 9 | [('Desktop', 3), ('Tablet', 1), ('Mobile', 2)] 10 | 11 | 12 | def test_str_to_dict(): 13 | assert make_dataset.str_to_dict("Mobile,Desktop,Mobile") ==\ 14 | [("Mobile", 2),("Desktop", 1)] 15 | 16 | 17 | def test_aggregate_dict(): 18 | assert make_dataset.aggregate_dict([[("Desktop", 3), ("Tablet", 1), ("Mobile", 2)] + 19 | [("Desktop", 3), ("Tablet", 1), ("Mobile", 2)]]) ==\ 20 | [('Desktop', 6), ('Tablet', 2), ('Mobile', 4)] 21 | 22 | 23 | # DATA PIPELINE 24 | # generate some test data in 25 | user_journey_dict = { 26 | 'Occurrences': [1, 12, 35], 27 | 'Sequence': ["/page1<>/page2<>/page2<>/page2>>/page2"] 29 | } 30 | 31 | user_journey_df = pd.DataFrame(user_journey_dict) 32 | 33 | def test_data_exists(): 34 | assert user_journey_df is not None 35 | assert user_journey_df.shape == (3, 3) 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .python-version 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | #pyenv environment list 34 | list/ 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | # DotEnv configuration 64 | .env 65 | 66 | # Database 67 | *.db 68 | *.rdb 69 | 70 | # Pycharm 71 | .idea 72 | 73 | # VS Code 74 | .vscode/ 75 | 76 | # Spyder 77 | .spyproject/ 78 | 79 | # Jupyter NB Checkpoints 80 | .ipynb_checkpoints/ 81 | 82 | # Some notebooks 83 | 2018-10-15-read_data_into_df.ipynb 84 | 85 | # exclude data from source control by default 86 | # we don't exclude to show data folder structure 87 | # data/ 88 | *.gz 89 | *.csv 90 | 91 | # Mac OS-specific storage files 92 | .DS_Store 93 | 94 | # exclude BQ key 95 | key/* 96 | -------------------------------------------------------------------------------- /src/data/queries/query_to_fail.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | PageSeq_Length, 4 | Actions_Length, 5 | GROUP_CONCAT(TrafficSource,",") AS TrafficSources, 6 | GROUP_CONCAT(TrafficMedium,",") AS TrafficMediums, 7 | Sequence 8 | FROM ( 9 | SELECT 10 | * 11 | FROM ( 12 | ----SELECT 13 | CONCAT(fullVisitorId,"-",STRING(visitId),"-",STRING(visitNumber),"-",STRING(TIMESTAMP(INTEGER(visitStartTime*1000000)))) AS sessionId, 14 | GROUP_CONCAT(CONCAT(pagePath,"::",CONCAT(IFNULL(hits.eventInfo.eventCategory,"NULL"),"//",IFNULL(hits.eventInfo.eventAction,"NULL"))),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Sequence, 15 | TrafficSource, 16 | TrafficMedium, 17 | Date, 18 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Actions_Length, 19 | SUM(IF(hits.type='PAGE',1,0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS PageSeq_Length 20 | FROM ( 21 | SELECT 22 | fullVisitorId, 23 | visitId, 24 | visitNumber, 25 | visitStartTime, 26 | hits.page.pagePath AS pagePath, 27 | hits.hitNumber AS hitNumber, 28 | trafficSource.source AS TrafficSource, 29 | trafficSource.medium AS TrafficMedium, 30 | hits.eventInfo.eventAction, 31 | date AS Date, 32 | hits.type, 33 | hits 34 | Date, 35 | Actions_Length, 36 | PageSeq_Length) 37 | GROUP BY 38 | Sequence, 39 | PageSeq_Length, 40 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/simple_test.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | PageSeq_Length, 4 | Actions_Length, 5 | GROUP_CONCAT(TrafficSource,",") AS TrafficSources, 6 | GROUP_CONCAT(TrafficMedium,",") AS TrafficMediums, 7 | Date, 8 | Sequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",STRING(visitId),"-",STRING(visitNumber),"-",STRING(TIMESTAMP(INTEGER(visitStartTime*1000000)))) AS sessionId, 15 | GROUP_CONCAT(CONCAT(pagePath,"::",CONCAT(IFNULL(hits.eventInfo.eventCategory,"NULL"),"//",IFNULL(hits.eventInfo.eventAction,"NULL"))),">>") OVER (PARTITION BY fullVisitorId, 
visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Sequence, 16 | TrafficSource, 17 | TrafficMedium, 18 | Date, 19 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Actions_Length, 20 | SUM(IF(hits.type='PAGE',1,0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS PageSeq_Length 21 | FROM ( 22 | SELECT 23 | fullVisitorId, 24 | visitId, 25 | visitNumber, 26 | visitStartTime, 27 | hits.page.pagePath AS pagePath, 28 | hits.hitNumber AS hitNumber, 29 | trafficSource.source AS TrafficSource, 30 | trafficSource.medium AS TrafficMedium, 31 | hits.eventInfo.eventAction, 32 | date AS Date, 33 | hits.type, 34 | hits.eventInfo.eventCategory, 35 | FROM 36 | TABLE_DATE_RANGE([govuk-bigquery-analytics:87773428.ga_sessions_], 37 | TIME_STAMP)) 38 | WHERE 39 | PageSeq_Length > 1 40 | GROUP BY 41 | sessionId, 42 | Sequence, 43 | TrafficSource, 44 | TrafficMedium, 45 | Date, 46 | Actions_Length, 47 | PageSeq_Length) 48 | GROUP BY 49 | Sequence, 50 | PageSeq_Length, 51 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/prelim_meta_standard_query.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence 8 | FROM ( 9 | SELECT 10 | * 11 | FROM ( 12 | SELECT 13 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 14 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(IFNULL(eventCategory, "NULL"),"<:<",IFNULL(eventAction, "NULL"))), 15 | ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 16 | DeviceCategory, 17 | Date, 18 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 19 | SUM(IF(htype='PAGE', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 20 | SUM(IF(eventAction='ffYesClick', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 21 | FROM ( 22 | SELECT 23 | fullVisitorId, 24 | visitId, 25 | visitNumber, 26 | visitStartTime, 27 | hits.page.pagePath AS pagePath, 28 | hits.hitNumber AS hitNumber, 29 | hits.type AS htype, 30 | hits.eventInfo.eventAction AS eventAction, 31 | hits.eventInfo.eventCategory AS eventCategory, 32 | date AS Date, 33 | device.deviceCategory AS DeviceCategory 34 | FROM 35 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 36 | CROSS JOIN 37 | UNNEST(sessions.hits) AS hits )) 38 | GROUP BY 39 | sessionId, 40 | Sequence, 41 | DeviceCategory, 42 | Date, 43 | EventYes, 44 | Actions_Length, 45 | PageSeq_Length) 46 | GROUP BY 47 | Sequence, 48 | PageSeq_Length, 49 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon_ab.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | REPLACE(ab_variant,"AB_DIMENSION_VALUE_PREFIX:","") as ABVariant, 4 | 
STRING_AGG(DeviceCategory, ",") AS DeviceCategories, 5 | Sequence 6 | FROM ( 7 | SELECT 8 | * 9 | FROM ( 10 | SELECT 11 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 12 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 13 | "NULL"),"<:<",IFNULL(eventAction, 14 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 15 | STRING_AGG(IF(htype = 'PAGE', 16 | pagePath, 17 | NULL), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 18 | DeviceCategory, 19 | ab_variant 20 | FROM ( 21 | SELECT 22 | fullVisitorId, 23 | visitId, 24 | visitNumber, 25 | visitStartTime, 26 | hits.page.pagePath AS pagePath, 27 | hits.hitNumber AS hitNumber, 28 | hits.type AS htype, 29 | hits.eventInfo.eventAction AS eventAction, 30 | hits.eventInfo.eventCategory AS eventCategory, 31 | ( 32 | SELECT 33 | value 34 | FROM 35 | hits.customDimensions 36 | WHERE 37 | index=59) AS taxon, 38 | ( 39 | SELECT 40 | IFNULL(value,"NULL") 41 | FROM 42 | sessions.customDimensions 43 | WHERE 44 | index=65) AS ab_variant, 45 | device.deviceCategory AS DeviceCategory 46 | FROM 47 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 48 | CROSS JOIN 49 | UNNEST(sessions.hits) AS hits ) ) 50 | WHERE 51 | DeviceCategory != "tablet" and 52 | ab_variant LIKE 'AB_DIMENSION_VALUE_PREFIX:%' 53 | GROUP BY 54 | sessionId, 55 | Sequence, 56 | PageSequence, 57 | DeviceCategory, 58 | ab_variant) 59 | GROUP BY 60 | Sequence, 61 | ABVariant -------------------------------------------------------------------------------- /src/data/queries/standard_query.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | PageSeq_Length, 4 | Actions_Length, 5 | TrafficSource, 6 | TrafficMedium, 7 | Date, 8 | Sequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"::",CONCAT(IFNULL(eventCategory, 16 | "NULL"),"//",IFNULL(eventAction, 17 | "NULL"))), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | TrafficSource, 19 | TrafficMedium, 20 | Date, 21 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 22 | SUM(IF(htype='PAGE', 23 | 1, 24 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 25 | SUM(IF(eventAction='ffYesClick', 26 | 1, 27 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 28 | FROM ( 29 | SELECT 30 | fullVisitorId, 31 | visitId, 32 | visitNumber, 33 | visitStartTime, 34 | hits.page.pagePath AS pagePath, 35 | hits.hitNumber AS hitNumber, 36 | hits.type AS htype, 37 | hits.eventInfo.eventAction AS eventAction, 38 | hits.eventInfo.eventCategory AS eventCategory, 39 | date AS Date, 40 | trafficSource.source AS TrafficSource, 41 | trafficSource.medium AS TrafficMedium 42 | FROM 43 | `govuk-bigquery-analytics.87773428.ga_sessions_*` AS sessions 44 | CROSS JOIN 45 | UNNEST(sessions.hits) AS hits 46 
| WHERE 47 | _TABLE_SUFFIX BETWEEN start_date 48 | AND end_date)) 49 | GROUP BY 50 | sessionId, 51 | Sequence, 52 | TrafficSource, 53 | TrafficMedium, 54 | Date, 55 | EventYes, 56 | Actions_Length, 57 | PageSeq_Length) 58 | GROUP BY 59 | Sequence, 60 | PageSeq_Length, 61 | Actions_Length, 62 | TrafficSource, 63 | TrafficMedium, 64 | Date -------------------------------------------------------------------------------- /src/data/queries/prelim_meta_standard_query_with_pageseq.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, "NULL"),"<:<",IFNULL(eventAction, "NULL"))), 16 | ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 17 | STRING_AGG(IF(htype = 'PAGE',pagePath,NULL),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 18 | DeviceCategory, 19 | Date, 20 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 21 | SUM(IF(htype='PAGE', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 22 | SUM(IF(eventAction='ffYesClick', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 23 | FROM ( 24 | SELECT 25 | fullVisitorId, 26 | visitId, 27 | visitNumber, 28 | visitStartTime, 29 | hits.page.pagePath AS pagePath, 30 | hits.hitNumber AS hitNumber, 31 | hits.type AS htype, 32 | hits.eventInfo.eventAction AS eventAction, 33 | hits.eventInfo.eventCategory AS eventCategory, 34 | date AS Date, 35 | device.deviceCategory AS DeviceCategory 36 | FROM 37 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 38 | CROSS JOIN 39 | UNNEST(sessions.hits) AS hits )) 40 | GROUP BY 41 | sessionId, 42 | Sequence, 43 | PageSequence, 44 | DeviceCategory, 45 | Date, 46 | EventYes, 47 | Actions_Length, 48 | PageSeq_Length) 49 | GROUP BY 50 | Sequence, 51 | PageSequence, 52 | PageSeq_Length, 53 | Actions_Length -------------------------------------------------------------------------------- /src/features/tests/test_build_features.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/features/ 2 | # python -m pytest tests/ 3 | import build_features 4 | 5 | 6 | def test_has_loop(): 7 | assert build_features.has_loop(["page1", "page2", "page1"]) is False 8 | assert build_features.has_loop(["page1", "page2", "page2"]) is True 9 | 10 | 11 | def test_has_repetition(): 12 | assert build_features.has_repetition(["page1", "page2", "page3"]) is False 13 | # Yields true due to self-loop, should be run on collapsed-loop page lists 14 | assert build_features.has_repetition(["page1", "page1", "page1"]) is True 15 | assert build_features.has_repetition(["page1", "page2", "page3", "page1", "page4"]) is True 
16 | assert build_features.has_repetition(["page2", "page3", "page2", "page1"]) is True 17 | 18 | 19 | def test_count_event_cat(): 20 | assert build_features.count_event_cat([('eventCategory1', 'eventAction1'), 21 | ('eventCategory2', 'eventAction2'), 22 | ('eventCategory2', 'eventAction1')]) == 2 23 | 24 | 25 | def test_count_event_act(): 26 | assert build_features.count_event_act([('eventCategory1', 'eventAction1'), 27 | ('eventCategory2', 'eventAction2'), 28 | ('eventCategory2', 'eventAction1')], 29 | category='eventCategory1', action='eventAction1') == 1 30 | 31 | 32 | def test_aggregate_event_count(): 33 | assert build_features.aggregate_event_cat([('eventCategory1', 'eventAction1'), 34 | ('eventCategory2', 'eventAction2'), 35 | ('eventCategory2', 'eventAction1')]) == \ 36 | [('eventCategory1', 1), ('eventCategory2', 2)] 37 | 38 | 39 | def test_aggregate_event_cat_act(): 40 | assert build_features.aggregate_event_cat_act([('eventCategory1', 'eventAction1'), 41 | ('eventCategory2', 'eventAction2'), 42 | ('eventCategory2', 'eventAction1')]) == \ 43 | [(('eventCategory1', 'eventAction1'), 1), 44 | (('eventCategory2', 'eventAction2'), 1), 45 | (('eventCategory2', 'eventAction1'), 1)] 46 | -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon_no_len_1_devcounts.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | SUM(IF(DeviceCategory='mobile',1,0)) AS MobileCount, 4 | SUM(IF(DeviceCategory='desktop',1,0)) AS DesktopCount, 5 | PageSeq_Length, 6 | Actions_Length, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 16 | "NULL"),"<:<",IFNULL(eventAction, 17 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | STRING_AGG(IF(htype = 'PAGE', 19 | pagePath, 20 | NULL),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 21 | DeviceCategory, 22 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 23 | SUM(IF(htype='PAGE', 24 | 1, 25 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length 26 | FROM ( 27 | SELECT 28 | fullVisitorId, 29 | visitId, 30 | visitNumber, 31 | visitStartTime, 32 | hits.page.pagePath AS pagePath, 33 | hits.hitNumber AS hitNumber, 34 | hits.type AS htype, 35 | hits.eventInfo.eventAction AS eventAction, 36 | hits.eventInfo.eventCategory AS eventCategory, 37 | ( 38 | SELECT 39 | value 40 | FROM 41 | hits.customDimensions 42 | WHERE 43 | index=59) AS taxon, 44 | device.deviceCategory AS DeviceCategory 45 | FROM 46 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 47 | CROSS JOIN 48 | UNNEST(sessions.hits) AS hits ) 49 | ) 50 | WHERE 51 | PageSeq_Length >1 52 | GROUP BY 53 | sessionId, 54 | Sequence, 55 | PageSequence, 56 | DeviceCategory, 57 | Actions_Length, 58 | PageSeq_Length) 59 | GROUP BY 60 | Sequence, 61 | PageSequence, 62 | PageSeq_Length, 63 | Actions_Length 
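-- Explanatory note (added comments, derived only from the query text above): the outer SELECT returns one row per distinct (Sequence, PageSequence, PageSeq_Length, Actions_Length); Occurrences counts the sessions sharing that journey, while MobileCount and DesktopCount split those sessions by device category. Single-page journeys are removed by the PageSeq_Length > 1 filter, and the TIME_STAMP suffix in the ga_sessions_ table name is replaced with a concrete date (one table per day) before the query is submitted to BigQuery.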
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | atomicwrites==1.2.1 3 | attrs==18.2.0 4 | backcall==0.1.0 5 | bleach==3.3.0 6 | bokeh==1.3.4 7 | cachetools==2.1.0 8 | certifi==2018.10.15 9 | chardet==3.0.4 10 | Click==7.0 11 | cloudpickle==1.2.2 12 | colorcet==2.0.2 13 | cycler==0.10.0 14 | dask==2.5.2 15 | datashader==0.8.0 16 | datashape==0.5.2 17 | decorator==4.3.0 18 | defusedxml==0.5.0 19 | distributed==2.5.2 20 | entrypoints==0.2.3 21 | fsspec==0.5.2 22 | google-api-core==1.5.0 23 | google-auth==1.5.1 24 | google-auth-oauthlib==0.2.0 25 | google-cloud-bigquery==1.6.0 26 | google-cloud-core==0.28.1 27 | google-resumable-media==0.3.1 28 | googleapis-common-protos==1.5.3 29 | HeapDict==1.0.1 30 | holoviews==1.12.6 31 | idna==2.7 32 | imageio==2.6.1 33 | ipykernel==5.1.0 34 | ipython==7.0.1 35 | ipython-genutils==0.2.0 36 | ipywidgets==7.4.2 37 | jedi==0.13.1 38 | Jinja2==2.10.1 39 | jsonschema==2.6.0 40 | jupyter==1.0.0 41 | jupyter-client==5.3.4 42 | jupyter-console==6.0.0 43 | jupyter-core>=4.6.0 44 | kiwisolver==1.1.0 45 | locket==0.2.0 46 | MarkupSafe==1.0 47 | matplotlib==3.1.1 48 | mistune==0.8.4 49 | more-itertools==4.3.0 50 | msgpack==0.6.2 51 | multipledispatch==0.6.0 52 | nbconvert==5.4.0 53 | nbformat==4.4.0 54 | networkx==2.3 55 | notebook==6.1.5 56 | numba==0.49.0 57 | numpy==1.16.3 58 | oauthlib==2.1.0 59 | packaging==19.2 60 | pandas==0.25.1 61 | pandas-gbq==0.6.1 62 | pandocfilters==1.4.2 63 | param==1.9.2 64 | parso==0.3.1 65 | partd==1.0.0 66 | pexpect==4.6.0 67 | pickleshare==0.7.5 68 | Pillow==8.2.0 69 | pluggy==0.8.0 70 | prometheus-client==0.4.2 71 | prompt-toolkit==2.0.6 72 | protobuf==3.6.1 73 | psutil==5.6.7 74 | ptyprocess==0.6.0 75 | py==1.10.0 76 | pyasn1==0.4.4 77 | pyasn1-modules==0.2.2 78 | pyct==0.4.6 79 | Pygments==2.7.4 80 | pyparsing==2.4.2 81 | pytest==3.9.3 82 | python-dateutil==2.7.3 83 | python-louvain==0.13 84 | pytz==2018.5 85 | pyviz-comms==0.7.2 86 | PyWavelets==1.0.3 87 | PyYAML==5.4 88 | pyzmq==17.1.2 89 | qtconsole==4.4.2 90 | requests>=2.20.0 91 | requests-oauthlib==1.0.0 92 | rsa==4.7 93 | scikit-image==0.16.1 94 | scipy==1.3.1 95 | Send2Trash==1.5.0 96 | simplegeneric==0.8.1 97 | six==1.11.0 98 | sortedcontainers==2.1.0 99 | tblib==1.4.0 100 | terminado>=0.8.3 101 | testpath==0.4.2 102 | toolz==0.10.0 103 | tornado==5.1.1 104 | traitlets==4.3.2 105 | urllib3==1.25.9 106 | wcwidth==0.1.7 107 | webencodings==0.5.1 108 | widgetsnbextension==3.4.2 109 | xarray==0.14.0 110 | zict==1.0.0 111 | -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 16 | "NULL"),"<:<",IFNULL(eventAction, 17 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | STRING_AGG(IF(htype = 'PAGE', 19 | pagePath, 20 | NULL),">>") OVER 
(PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 21 | DeviceCategory, 22 | Date, 23 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 24 | SUM(IF(htype='PAGE', 25 | 1, 26 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 27 | SUM(IF(eventAction='ffYesClick', 28 | 1, 29 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 30 | FROM ( 31 | SELECT 32 | fullVisitorId, 33 | visitId, 34 | visitNumber, 35 | visitStartTime, 36 | hits.page.pagePath AS pagePath, 37 | hits.hitNumber AS hitNumber, 38 | hits.type AS htype, 39 | hits.eventInfo.eventAction AS eventAction, 40 | hits.eventInfo.eventCategory AS eventCategory, 41 | ( 42 | SELECT 43 | value 44 | FROM 45 | hits.customDimensions 46 | WHERE 47 | index=59) AS taxon, 48 | date AS Date, 49 | device.deviceCategory AS DeviceCategory 50 | FROM 51 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 52 | CROSS JOIN 53 | UNNEST(sessions.hits) AS hits ) ) 54 | GROUP BY 55 | sessionId, 56 | Sequence, 57 | PageSequence, 58 | DeviceCategory, 59 | Date, 60 | EventYes, 61 | Actions_Length, 62 | PageSeq_Length) 63 | GROUP BY 64 | Sequence, 65 | PageSequence, 66 | PageSeq_Length, 67 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon_no_len_1.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 16 | "NULL"),"<:<",IFNULL(eventAction, 17 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | STRING_AGG(IF(htype = 'PAGE', 19 | pagePath, 20 | NULL),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 21 | DeviceCategory, 22 | Date, 23 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 24 | SUM(IF(htype='PAGE', 25 | 1, 26 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 27 | SUM(IF(eventAction='ffYesClick', 28 | 1, 29 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 30 | FROM ( 31 | SELECT 32 | fullVisitorId, 33 | visitId, 34 | visitNumber, 35 | visitStartTime, 36 | hits.page.pagePath AS pagePath, 37 | hits.hitNumber AS hitNumber, 38 | hits.type AS htype, 39 | hits.eventInfo.eventAction AS eventAction, 40 | hits.eventInfo.eventCategory AS eventCategory, 41 | ( 42 | SELECT 43 | value 44 | FROM 45 | hits.customDimensions 46 | WHERE 47 | index=59) AS taxon, 48 | date AS Date, 49 | 
device.deviceCategory AS DeviceCategory 50 | FROM 51 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 52 | CROSS JOIN 53 | UNNEST(sessions.hits) AS hits ) 54 | ) 55 | WHERE 56 | PageSeq_Length >1 57 | GROUP BY 58 | sessionId, 59 | Sequence, 60 | PageSequence, 61 | DeviceCategory, 62 | Date, 63 | EventYes, 64 | Actions_Length, 65 | PageSeq_Length) 66 | GROUP BY 67 | Sequence, 68 | PageSequence, 69 | PageSeq_Length, 70 | Actions_Length -------------------------------------------------------------------------------- /src/features/build_features.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | 4 | def has_loop(page_list): 5 | """ 6 | Check if a list of page hits contains an adjacent page loop (A >> A >> B) == True. 7 | :param page_list: list of page hits derived from BQ user journey 8 | :return: True if there is a loop 9 | """ 10 | return any(i == j for i, j in zip(page_list, page_list[1:])) 11 | 12 | 13 | def has_repetition(page_list): 14 | """ 15 | Check if a list of page hits contains a page repetition (A >> B >> A) == True. 16 | Run on journeys with collapsed loops so stuff like A >> A >> B are not captured as a repetition. 17 | Similar to cycles/triangles, but from a flat perspective. 18 | :param page_list: list of page hits derived from BQ user journey 19 | :return: True if there is a repetition 20 | """ 21 | return len(set(page_list)) != len(page_list) 22 | 23 | 24 | # Counters for events 25 | def count_event_cat(event_list): 26 | """ 27 | TODO: possibly remove 28 | Count different event categories present in an event_list. Includes "NULL" events coming from page 29 | hits for the sake of completeness. Does not include frequency. 30 | :param event_list: list of event tuples (eventCategory,eventAction) 31 | :return: number of different eventCategories present 32 | """ 33 | return len(set([cat for cat, _ in event_list])) 34 | 35 | 36 | def count_event_act(event_list, category, action): 37 | """ 38 | TODO: possibly remove 39 | Count number of specific eventActions given a specific eventCategory 40 | :param event_list: list of event tuples (eventCategory,eventAction) 41 | :param category: target eventCategory 42 | :param action: target eventAction 43 | :return: count 44 | """ 45 | return [action for cat, action in event_list if cat == category].count(action) 46 | 47 | 48 | def aggregate_event_cat(event_list): 49 | """ 50 | Return a dictionary-like list of eventCategory frequency counts. 51 | :param event_list: list of event tuples (eventCategory,eventAction) 52 | :return: dict-like list of frequencies [(eventCat1, freq_1),(eventCat2, freq_2),...] 53 | """ 54 | return list(Counter([cat for cat, _ in event_list]).items()) 55 | 56 | 57 | def aggregate_event_cat_act(event_list): 58 | """ 59 | Return a dictionary-like list of (eventCategory,eventAction) frequency counts. 60 | :param event_list: list of event tuples (eventCategory,eventAction) 61 | :return: dict-like list of frequencies [((eventCat1,eventAction1) freq_1),((eventCat1,eventAction2) freq_2),...] 62 | """ 63 | return list(Counter([(cat, act) for cat, act in event_list]).items()) 64 | -------------------------------------------------------------------------------- /notebooks/network_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Python setup for MacOS 2 | 3 | This is a quick run through on how to set up Python on your machines. 
We'll be 4 | using `pip` to install our packages, and `pyenv` with its `pyenv-virtualenv` 5 | plugin to manage different Python versions and virtual environments, 6 | respectively. 7 | 8 | Python virtual environments allow you to create an isolated environment. This 9 | can have its own dependencies (different packages, different versions) 10 | completely separate from every other environment. 11 | 12 | These instructions have been adapted from [The Hitchhiker's Guide to Python](https://docs.python-guide.org/starting/install3/osx/). 13 | Further detail about `pyenv-virtualenv` can be found in its [documentation](https://github.com/pyenv/pyenv-virtualenv#pyenv-virtualenv). 14 | 15 | By default, macOS has Python 2 installed, but we need Python 3. 16 | 17 | Install [Homebrew](https://brew.sh/) using Terminal. 18 | ``` 19 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 20 | ``` 21 | 22 | Install the latest version of Python 3 using Homebrew; this should also install 23 | `pip` for you automatically. 24 | ``` 25 | brew install python 26 | ``` 27 | 28 | Add your newly-installed Python to PATH, and activate your changes. Then validate that your Python 3 version has 29 | been installed. 30 | ``` 31 | echo 'export PATH="/usr/local/opt/python/libexec/bin:$PATH"' >> ~/.bash_profile 32 | source ~/.bash_profile # activates your changes; alternatively, restart your Terminal 33 | python --version # as of Oct 2019, this should be Python 3.7.4 on Homebrew 34 | ``` 35 | 36 | Use Homebrew to install `pyenv` and its `pyenv-virtualenv` plugin, add the required 37 | lines to your `.bash_profile`, then activate the changes. 38 | ``` 39 | brew install pyenv 40 | brew install pyenv-virtualenv 41 | 42 | echo 'eval "$(pyenv init -)"' >> ~/.bash_profile 43 | echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.bash_profile 44 | source ~/.bash_profile # activates your changes; alternatively, restart your Terminal 45 | ``` 46 | 47 | Create a new Python virtual environment running Python 3.6.9; we'll call this 48 | virtual environment `govuk-network-data`. 49 | ``` 50 | pyenv virtualenv 3.6.9 govuk-network-data 51 | ``` 52 | 53 | You need to activate this virtual environment before installing packages and using 54 | it. 55 | ``` 56 | pyenv activate govuk-network-data 57 | ``` 58 | 59 | Now install the packages listed in the `requirements.txt` file in this 60 | repository. 61 | ``` 62 | pip install -r requirements.txt 63 | ``` 64 | 65 | To deactivate the virtual environment, run the following code: 66 | ``` 67 | pyenv deactivate 68 | ``` 69 | -------------------------------------------------------------------------------- /src/data/tests/test_bq_extract_data.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/data/ 2 | # python -m pytest tests/ 3 | import bq_extract_data 4 | 5 | 6 | def test_find_query(): 7 | # only returns .sql files, addresses issue 10 somewhat 8 | assert bq_extract_data.find_query("test_bq_extract_data.py", "./tests") is None 9 | assert bq_extract_data.find_query("quer", "./tests") == "./tests/query.sql" 10 | # returns first file to match query_arg, bug or feature? 
11 | assert bq_extract_data.find_query("", "./tests") == "./tests/test.sql" 12 | # potential bug spotted 13 | # assert bq_extract_data.find_query("query.sql", "./tests") == "./tests/query.sql" 14 | 15 | 16 | # test removing linebreaks from sql query file 17 | # add space for line breaks 18 | def test_read_query(): 19 | assert bq_extract_data.read_query("./tests/test.sql") == "SELECT * FROM tables WHERE thing < 5" 20 | # handles indent as represented by two-spaces 21 | assert bq_extract_data.read_query("./tests/query.sql") == "SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIME_STAMP)) WHERE PageSeq_Length > 1" 22 | 23 | 24 | def test_change_timestamp(): 25 | """ 26 | Unit test for change_timestamp. Tests for both "standard" and "legacy" SQL timestamp differences. 27 | """ 28 | # standard 29 | assert bq_extract_data.change_timestamp(x = "SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIME_STAMP)) WHERE PageSeq_Length > 1", date = "2018-12-31", dialect = "standard") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 20181231)) WHERE PageSeq_Length > 1' 30 | # legacy 31 | assert bq_extract_data.change_timestamp(x = "SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIME_STAMP)) WHERE PageSeq_Length > 1", date = "2018-12-31", dialect = "legacy") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIMESTAMP("2018-12-31"), TIMESTAMP("2018-12-31")))) WHERE PageSeq_Length > 1' 32 | # standard, input x with read_query output 33 | assert bq_extract_data.change_timestamp(x = bq_extract_data.read_query("./tests/query.sql"), date = "2018-12-31", dialect = "standard") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 20181231)) WHERE PageSeq_Length > 1' 34 | 35 | # functional test 36 | def test_find_read_change_timestamp_combined(): 37 | """ 38 | Combines the three functions above. A user provides an 39 | approximate name of the file in a given dir that holds their 40 | SQL query of interest. This is read in and converted to a string, 41 | replacing line breaks with spaces. This "SQL query" str 42 | then has its timestamps adjusted to the correct dialect 43 | and so that the correct table is read in BigQuery. 44 | One table per day. 
45 | """ 46 | assert bq_extract_data.change_timestamp(bq_extract_data.read_query(bq_extract_data.find_query("query", "./tests")), 47 | date = "2018-12-31", dialect = "standard") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 20181231)) WHERE PageSeq_Length > 1' 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/data/tests/test_preprocess.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/data/ 2 | # python3 -m pytest tests/ 3 | import preprocess 4 | 5 | 6 | def test_bq_journey_to_pe_list(): 7 | assert preprocess.bq_journey_to_pe_list("page1<>page2<>") ==\ 8 | [('page1', 'eventCategory1<:>".join(page_list) + "\"" + "\t" 67 | 68 | # Writing events columns 69 | event_list = prep.extract_pe_components(page_event_list, 1) 70 | write_to_file += "\"" + str(event_list) + "\"" + "\t" 71 | write_to_file += "\"" + str(feat.count_event_cat(event_list)) + "\"" + "\t" 72 | write_to_file += "\"" + str(feat.aggregate_event_cat(event_list)) + "\"" + "\t" 73 | write_to_file += "\"" + str(feat.aggregate_event_cat_act(event_list)) + "\"" + "\t" 74 | 75 | # Writing taxon_list 76 | write_to_file += "\"" + str(prep.extract_cd_components(page_event_list, 2)) + "\"" + "\t" 77 | write_to_file += "\"" + str(prep.extract_pcd_list(page_event_list, 2)) + "\"" + "\t" 78 | 79 | # Writing loop column stuff 80 | de_looped = prep.collapse_loop(page_list) 81 | write_to_file += "\"" + str(de_looped) + "\"" + "\t" 82 | write_to_file += "\"" + ">>".join(de_looped) + "\"" 83 | 84 | write_to_file += "\n" 85 | 86 | if i % 500000 == 0: 87 | logging.info("At index: {}".format(i)) 88 | write_file.write(write_to_file.encode()) 89 | write_to_file = "" 90 | write_file.flush() 91 | 92 | if i == number_lines - 1 and write_to_file != "": 93 | logging.info("At index via last: {}".format(i)) 94 | write_file.write(write_to_file.encode()) 95 | write_to_file = "" 96 | write_file.flush() 97 | 98 | 99 | if __name__ == "__main__": 100 | 101 | parser = argparse.ArgumentParser(description='Module that produces a metadata-aggregated and ' 102 | 'preprocessed dataset (.csv.gz), given a merged file.') 103 | parser.add_argument('in_file', help='Input dataframe file, this module adds .csv.gz automatically ') 104 | 105 | args = parser.parse_args() 106 | 107 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 108 | logging.config.fileConfig(LOGGING_CONFIG) 109 | logger = logging.getLogger('preprocess_dataset') 110 | 111 | DATA_DIR = os.getenv("DATA_DIR") 112 | 113 | read_path = os.path.join(DATA_DIR, "raw_bq_extract", args.in_file+".csv.gz") 114 | write_path = os.path.join(DATA_DIR, "processed_journey", args.in_file.replace("merged", "preprocessed")+".csv.gz") 115 | 116 | if os.path.isfile(read_path): 117 | logging.info("Reading from \"{}\" and writing to \"{}\"...".format(read_path, write_path)) 118 | num_lines = count_lines(read_path) 119 | logging.info("Number of rows in dataframe: {}".format(num_lines)) 120 | logging.info("Reading, processing, writing file...") 121 | read_write_file(read_path, write_path, num_lines) 122 | else: 123 | logging.error("Input file \"{}\" does not exist.".format(read_path)) 124 | -------------------------------------------------------------------------------- /notebooks/eda/notebook_functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Helper functions used in the EDA notebooks 3 | ''' 4 | 
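# Explanatory comment (added; based only on the pipeline code shown earlier in this extract): the
# processed-journey CSVs store list-like columns (e.g. Page_List, Page_Event_List, DeviceCategories)
# as Python literal strings, so the helpers in this module rely on pd.eval to turn those strings back
# into lists before indexing or counting their elements.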
5 | import numpy as np 6 | import pandas as pd 7 | 8 | def get_end_page(Page_List): 9 | return pd.eval(Page_List)[-1] 10 | 11 | def get_end_page_event(Page_Event_List): 12 | return pd.eval(Page_Event_List)[-1][-1] 13 | 14 | def count_desktop(DeviceCategories): 15 | thelist = pd.eval(DeviceCategories) 16 | desktop = 0 17 | mobile = 0 18 | other = 0 19 | for i in range(len(thelist)): 20 | if thelist[i][0] =='desktop': 21 | desktop = thelist[i][1] 22 | elif thelist[i][0] =='mobile': 23 | mobile = thelist[i][1] 24 | else: 25 | other = thelist[i][1] 26 | return desktop, mobile, other 27 | 28 | def derive_new_variables(df): 29 | print("creating page sequence length vars") 30 | # string to list 31 | df['page_list_eval'] = df['Page_List'].map(pd.eval) 32 | # count list items in the page sequence (so this is page count for the journey) 33 | df['page_seq_len'] = df['page_list_eval'].map(len) 34 | 35 | # string to list 36 | df['page_list_NL_eval'] = df['Page_List_NL'].map(pd.eval) 37 | # Count the page sequence without loops so B -> A ->A is B -> A and length is 2 38 | df['page_seq_len_NL'] = df['page_list_NL_eval'].map(len) 39 | 40 | print("Creating search vars") 41 | 42 | # variable to count how many times do the keywords that identify search appear in the page sequence? 43 | df['count_search'] = df.PageSequence.str.count("/search?") + df.PageSequence.str.count("/search/") 44 | 45 | # new variable: does the event list include the term "start"? yes ->1, no ->0 46 | df['event_list_contains_start'] = np.where(df.Event_List.str.contains("start"), 1, 0) 47 | # new variable: does the page sequence include the term "start"? yes ->1, no ->0 48 | df['page_seq_contains_start'] = np.where(df.Sequence.str.contains("start"), 1, 0) 49 | # new variable: does the page sequence include the term "service.gov.uk"? yes ->1, no ->0 50 | # This identifies external links to a service which has passed a serivce assessment 51 | df['page_seq_contains_service.gov.uk'] = np.where(df.Sequence.str.contains("service.gov.uk"), 1, 0) 52 | 53 | df['final_page'] = df['Page_List'].map(get_end_page) 54 | df['final_interaction'] = df['Page_Event_List'].map(get_end_page_event) 55 | 56 | # new variable: does the page sequence include the terms which identify internal search? 
yes ->1, no ->0 57 | df['contains_search_regex'] = np.where( 58 | (df.PageSequence.str.contains("/search?")) | (df.PageSequence.str.contains("/search/")), 1, 0) 59 | 60 | df['contains_search_n'] = df['contains_search_regex'] * df['Page_Seq_Occurrences'] 61 | 62 | df['desktop'], df['mobile'], df['other_device'] = zip( 63 | *df['DeviceCategories'].map(count_desktop)) 64 | 65 | df['more_desktop'] = np.where(df['desktop'] > (df['mobile'] + df['other_device']), 1, 0) 66 | 67 | print("creating final_page_type") 68 | 69 | df['final_page_type'] = 'other' 70 | df.loc[df['final_page'].str.contains('/government/publications/'), 'final_page_type'] = 'government_publication' 71 | df.loc[df['final_page'].str.contains('log-in'), 'final_page_type'] = 'login' 72 | df.loc[df['final_page'].str.contains('sign-in'), 'final_page_type'] = 'login' 73 | df.loc[df['final_page'].str.contains('login'), 'final_page_type'] = 'login' 74 | df.loc[df['final_page'].str.contains('check'), 'final_page_type'] = 'check' 75 | df.loc[df['final_page'].str.contains('apply'), 'final_page_type'] = 'apply' 76 | df.loc[df['final_page'].str.contains('contact'), 'final_page_type'] = 'contact/enquiries' 77 | df.loc[df['final_page'].str.contains('enquiries'), 'final_page_type'] = 'contact/enquiries' 78 | df.loc[df['final_page'].str.contains(r'get-.*-information.*'), 'final_page_type'] = 'get_information' 79 | df.loc[df['final_page'].str.contains('send'), 'final_page_type'] = 'send' 80 | df.loc[df['final_page'].str.contains('find'), 'final_page_type'] = 'find' 81 | df.loc[df['final_page'].str.contains('calculat'), 'final_page_type'] = 'calculate/calculator' 82 | df.loc[df['final_page'].str.contains('order'), 'final_page_type'] = 'order' 83 | df.loc[df['final_page'].str.contains('manage'), 'final_page_type'] = 'manage' 84 | df.loc[df['final_page'].str.contains('update'), 'final_page_type'] = 'update' 85 | df.loc[df['final_page'].str.contains('eligibility'), 'final_page_type'] = 'eligibility' 86 | df.loc[df['final_page'].str.contains('estimate'), 'final_page_type'] = 'estimate' 87 | df.loc[df['final_page'].str.contains('renew'), 'final_page_type'] = 'renew' 88 | df.loc[df['final_page'].str.contains('pay'), 'final_page_type'] = 'pay' 89 | df.loc[df['final_page'].str.contains('claim'), 'final_page_type'] = 'claim' 90 | df.loc[df['final_page'].str.contains('change'), 'final_page_type'] = 'change' 91 | 92 | df['final_interaction_type'] = df.final_interaction.str.extract(r'<:<(.*)<:<', expand=False) 93 | df['final_external_link'] = df.final_interaction.str.extract(r'EVENT<:>".join(page_list) + "\"" + "\t" 70 | 71 | # Writing events columns 72 | event_list = prep.extract_pe_components(page_event_list, 1) 73 | # write_to_file += "\"" + str(event_list) + "\"" + "\t" 74 | # write_to_file += "\"" + str(feat.count_event_cat(event_list)) + "\"" + "\t" 75 | # write_to_file += "\"" + str(feat.aggregate_event_cat(event_list)) + "\"" + "\t" 76 | 77 | # Event_cat_act_agg 78 | write_to_file += "\"" + str(feat.aggregate_event_cat_act(event_list)) + "\"" + "\t" 79 | 80 | # # Writing taxon_list 81 | # write_to_file += "\"" + str(prep.extract_cd_components(page_event_list, 2)) + "\"" + "\t" 82 | # write_to_file += "\"" + str(prep.extract_pcd_list(page_event_list, 2)) + "\"" + "\t" 83 | 84 | # # Writing loop column stuff 85 | # de_looped = prep.collapse_loop(page_list) 86 | # write_to_file += "\"" + str(de_looped) + "\"" + "\t" 87 | # write_to_file += "\"" + ">>".join(de_looped) + "\"" 88 | 89 | write_to_file += "\n" 90 | 91 | if i % 500000 == 0: 92 | 
logging.info("At index: {}".format(i)) 93 | write_file.write(write_to_file.encode()) 94 | write_to_file = "" 95 | write_file.flush() 96 | 97 | if i == number_lines - 1 and write_to_file != "": 98 | logging.info("At index via last: {}".format(i)) 99 | write_file.write(write_to_file.encode()) 100 | write_to_file = "" 101 | write_file.flush() 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | parser = argparse.ArgumentParser(description='Module that produces a metadata-aggregated and ' 107 | 'preprocessed dataset (.csv.gz), given a merged file.') 108 | parser.add_argument('in_file', help='Input dataframe file, this module adds .csv.gz automatically ') 109 | 110 | args = parser.parse_args() 111 | 112 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 113 | logging.config.fileConfig(LOGGING_CONFIG) 114 | logger = logging.getLogger('preprocess_dataset') 115 | 116 | DATA_DIR = os.getenv("DATA_DIR") 117 | 118 | read_path = os.path.join(DATA_DIR, "raw_bq_extract", args.in_file+".csv.gz") 119 | write_path = os.path.join( 120 | DATA_DIR, "processed_journey", 121 | args.in_file.replace("merged", "preprocessed")+"_thinner.csv.gz") 122 | 123 | if os.path.isfile(read_path): 124 | logging.info("Reading from \"{}\" and writing to \"{}\"...".format(read_path, write_path)) 125 | num_lines = count_lines(read_path) 126 | logging.info("Number of rows in dataframe: {}".format(num_lines)) 127 | logging.info("Reading, processing, writing file...") 128 | read_write_file(read_path, write_path, num_lines) 129 | else: 130 | logging.error("Input file \"{}\" does not exist.".format(read_path)) 131 | -------------------------------------------------------------------------------- /src/data/archived_multiprocess/multiprocess_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def delete_vars(x): 6 | """ 7 | Force object deletion 8 | :param x: object to delete 9 | """ 10 | if isinstance(x, list): 11 | for xs in x: 12 | del xs 13 | del x 14 | 15 | 16 | def compute_max_depth(test_list, chunks, depth, fewer_than_cpu): 17 | """ 18 | Compute maximum recursive depth of process_dataframes, governs MAX_DEPTH global and at which point of execution 19 | one-off rows (based on Occurrence # of PageSequence) will be dropped. 20 | :param test_list: dummy list based on list of files to be read/processed. 
21 | :param chunks: initial number of partitions 22 | :param depth: init = 0, increases with every recursive call 23 | :return: (int) maximum recursive depth 24 | """ 25 | partitions = partition_list(test_list, chunks, fewer_than_cpu) 26 | if len(test_list) > 1: 27 | new_lst = [0 for _ in partitions] 28 | return compute_max_depth(new_lst, (lambda x: int(x / 2) if int(x / 2) > 0 else 1)(chunks), depth + 1, 29 | fewer_than_cpu) 30 | else: 31 | return depth 32 | 33 | 34 | def compute_initial_chunksize(number_of_files, num_cpu): 35 | """ 36 | 37 | :param num_cpu: 38 | :param number_of_files: 39 | :return: 40 | """ 41 | if number_of_files > num_cpu: 42 | return int(number_of_files / 2) 43 | else: 44 | return number_of_files 45 | 46 | 47 | def compute_batches(files, batchsize): 48 | """ 49 | 50 | :param files: 51 | :param batchsize: 52 | :return: 53 | """ 54 | 55 | if len(files) > int(np.ceil(batchsize * 1.5)): 56 | return True, merge_small_partition([files[i:i + batchsize] for i in range(0, len(files), batchsize)]) 57 | else: 58 | return False, files 59 | 60 | 61 | def merge_sliced_df(sliced_df_list: list, expected_size: int): 62 | """ 63 | Merge dataframe slices (column pairs) when appropriate (codes match) and append to a list of merged dataframes. 64 | Due to order of columns, the Occurrences slice will be used as a basis for the merge. 65 | :param sliced_df_list: list of slices 66 | :param expected_size: number of dataframes that have been originally sliced 67 | :return: list of merged dataframes 68 | """ 69 | final_list = [pd.DataFrame()] * expected_size 70 | # print([df.shape for i, df in sliced_df_list if i == 0]) 71 | # i = dataframe code, dataframes may come from multiple files. 72 | for i, df in sliced_df_list: 73 | # print(df.columns) 74 | if len(final_list[i]) == 0: 75 | # print("new") 76 | final_list[i] = df.copy(deep=True) 77 | else: 78 | # print("merge") 79 | final_list[i] = pd.merge(final_list[i], df, how='left', on='Sequence') 80 | return final_list 81 | 82 | 83 | def partition_list(dataframe_list: list, chunks: int, fewer_than_cpu): 84 | """ 85 | Build a list of partitions from a list of dataframes. Based on indices. 86 | :param dataframe_list: list of dataframes 87 | :param chunks: number of indices lists to generate, len(partition_list) 88 | :return: partition list, list of lists containing indices 89 | """ 90 | if chunks > 0: 91 | initial = [list(xs) for xs in np.array_split(list(range(len(dataframe_list))), chunks)] 92 | # print(initial) 93 | if len(initial) > 1 and not fewer_than_cpu: 94 | initial = merge_small_partition(initial) 95 | return initial 96 | else: 97 | return [[0]] 98 | 99 | 100 | def merge_small_partition(partitions: list): 101 | """ 102 | Merge small partitions of length 1 into previous partition, reduce number of recursive runs. 103 | :param partitions: 104 | :return: 105 | """ 106 | to_merge = [] 107 | for partition in partitions: 108 | if len(partition) == 1: 109 | to_merge.append(partition[0]) 110 | partitions.remove(partition) 111 | if len(to_merge) >= 1: 112 | partitions[-1].extend(to_merge) 113 | return partitions 114 | 115 | 116 | def slice_many_df(df_list, drop_one_offs, sliceable_cols, ordered=False): 117 | """ 118 | Slice a list of dataframes into their columns. First list will consist of 119 | (df_number, [Sequence, PageSequence, Occurrences]) 120 | slices, second list will consist of (df_number, [Sequence, AggregatableMetadata1]), 121 | (df_number, [Sequence, AggregatableMetadata2]) etc. 
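    When ordered is False, a single flat list of (df_number, slice) pairs is returned instead of the two lists.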
122 | Reduces size of dataframes passed on to worker processes, so they don't break. 123 | :param df_list: 124 | :param ordered: 125 | :return: 126 | """ 127 | if not ordered: 128 | return [(i, df.iloc[:, ind].copy(deep=True)) for i, df in enumerate(df_list) for ind in 129 | slice_dataframe(df, drop_one_offs, sliceable_cols)] 130 | else: 131 | return [(i, df.iloc[:, ind].copy(deep=True)) for i, df in enumerate(df_list) for ind in 132 | slice_dataframe(df, drop_one_offs, sliceable_cols) if 133 | "Occurrences" in df.columns[ind]], [(i, df.iloc[:, ind].copy(deep=True)) for i, df in enumerate(df_list) 134 | for ind in 135 | slice_dataframe(df, drop_one_offs, sliceable_cols) if 136 | "Occurrences" not in df.columns[ind]] 137 | 138 | 139 | def slice_dataframe(df, drop_one_offs, sliceable_cols): 140 | """ 141 | Computes the slices (column pairs) of dataframe 142 | :param df: dataframe to be sliced 143 | :param drop_one_offs: 144 | :param sliceable_cols: 145 | :return: list of dataframe slices 146 | """ 147 | sliced_df = [] 148 | for col in sliceable_cols: 149 | if col in df.columns: 150 | if col == "Occurrences": 151 | if drop_one_offs: 152 | sliced_df.append( 153 | [df.columns.get_loc("Sequence"), df.columns.get_loc("PageSequence"), df.columns.get_loc(col)]) 154 | else: 155 | sliced_df.append( 156 | [df.columns.get_loc("Sequence"), df.columns.get_loc(col)]) 157 | else: 158 | sliced_df.append([df.columns.get_loc("Sequence"), df.columns.get_loc(col)]) 159 | return sliced_df 160 | -------------------------------------------------------------------------------- /src/data/taxon_translate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | 7 | def recursive_parenting(taxon_df, content_id, parent_content_id, parent_list): 8 | """ 9 | Recursively compute a taxon's parents 10 | :param taxon_df: taxon dataframe from content tagger (taxon json file) 11 | :param content_id: target taxon content_id 12 | :param parent_content_id: target taxon's parent content_id 13 | :param parent_list: incrementing list of parents 14 | :return: recursive call, aggregated list of parents if top level 15 | """ 16 | if isinstance(parent_content_id, float) and len(parent_list) == 0: 17 | return [] 18 | elif isinstance(parent_content_id, float): 19 | return [[parent_taxon, i + 1] for i, parent_taxon in enumerate(reversed(parent_list))] 20 | else: 21 | content_id = parent_content_id 22 | parent_content_id = taxon_df[taxon_df.content_id == parent_content_id].iloc[0].parent_content_id 23 | title = taxon_df[taxon_df.content_id == content_id].iloc[0].title 24 | parent_list.append([content_id, parent_content_id, title]) 25 | return recursive_parenting(taxon_df, content_id, parent_content_id, parent_list) 26 | 27 | 28 | def build_taxon_set(taxon_series): 29 | """ 30 | Build set of unique taxons from the input taxon Series induced from the network node dataframe. 31 | :param taxon_series: Taxon column from the network node df, list of taxon content_id lists. 32 | :return: unique set containing taxon content_ids from nodes 33 | """ 34 | return set([content_id for taxon_list in taxon_series for content_id in taxon_list]) 35 | 36 | 37 | def map_taxon_content_ids(target_taxon_df, nodes_df): 38 | """ 39 | Extract taxons from node dataframe as a unique set of taxon content_ids and then compute their title, base_path 40 | (main component to be returned), level, parents (if any, else NaN) and finally the top-most parent. 
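    A taxon with no parent ends up with level 1, an empty parents list and itself as its level1_parent.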
41 | :param target_taxon_df: taxon dataframe from content tagger (taxon json file) 42 | :param nodes_df: dataframe with network nodes 43 | :return: dataframe containing taxon information 44 | """ 45 | 46 | column_list = ['content_id', 'title', 'base_path', 'level', 'parents', 'level1_parent'] 47 | taxon_level_df = pd.DataFrame(columns=column_list) 48 | 49 | taxon_set = build_taxon_set(nodes_df.Node_Taxon) 50 | 51 | for content_id in taxon_set: 52 | if target_taxon_df[target_taxon_df.content_id == content_id].shape[0] > 0: 53 | title = target_taxon_df[target_taxon_df.content_id == content_id].iloc[0].title 54 | base_path = target_taxon_df[target_taxon_df.content_id == content_id].iloc[0].base_path 55 | parent_list = pd.Series(recursive_parenting(target_taxon_df, content_id, 56 | target_taxon_df[ 57 | target_taxon_df.content_id == content_id].parent_content_id.values[ 58 | 0], [])) 59 | current_level = len(parent_list) + 1 60 | level1_par = title 61 | if len(parent_list.values) > 0: 62 | level1_par = parent_list.values[0][0][2] 63 | taxon_level_df = pd.concat([taxon_level_df, pd.DataFrame([[content_id, 64 | title, 65 | base_path, 66 | current_level, 67 | parent_list.values, 68 | level1_par]], columns=column_list)]) 69 | taxon_level_df.reset_index(drop=True, inplace=True) 70 | taxon_level_df.drop_duplicates(subset="content_id", keep="first", inplace=True) 71 | return taxon_level_df 72 | 73 | 74 | def add_taxon_basepath_to_df(node_df, taxon_level_df): 75 | """ 76 | Compute appropriate taxon base_paths for list of taxon content_ids and add to node dataframe. 77 | :param node_df: dataframe with network nodes 78 | :param taxon_level_df: dataframe containing taxon information (taxons nodes are tagged with) 79 | :return: augmented node dataframe, including taxon base_paths 80 | """ 81 | content_basepath_dict = dict(zip(taxon_level_df.content_id, taxon_level_df.base_path)) 82 | taxon_name_list = [] 83 | for tup in node_df.itertuples(): 84 | taxon_basepath = [] 85 | for taxon in tup.Node_Taxon: 86 | if taxon in content_basepath_dict.keys(): 87 | taxon_basepath.append(content_basepath_dict[taxon]) 88 | taxon_name_list.append(taxon_basepath) 89 | node_df['Node_Taxon_basepath'] = taxon_name_list 90 | return node_df 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser( 95 | description='Module to translate taxon content_ids in node file to taxon base paths. In addition, ecursively ' 96 | 'compute taxon' 97 | 'level, parents and top-most parents.') 98 | parser.add_argument('node_filename', help='Node input filename.') 99 | parser.add_argument('taxon_dir', help='Directory containing taxon json file.') 100 | parser.add_argument('taxon_output_filename', default="", 101 | help='Naming convention for resulting taxon dataframe file. 
Includes taxons that nodes in node ' 102 | 'file are tagged to.') 103 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 104 | args = parser.parse_args() 105 | 106 | DATA_DIR = os.getenv("DATA_DIR") 107 | nodes_path = os.path.join(DATA_DIR, "processed_data", args.node_filename + ".csv.gz") 108 | taxons_path = os.path.join(args.taxon_dir, "taxons.json.gz") 109 | 110 | if os.path.exists(taxons_path) and os.path.exists(nodes_path): 111 | print("Working on: {}".format(taxons_path)) 112 | taxons_json_df = pd.read_json(taxons_path, compression="gzip") 113 | print("Working on: {} ".format(nodes_path)) 114 | nodes_df = pd.read_csv(nodes_path, sep="\t", compression="gzip") 115 | 116 | taxon_df = map_taxon_content_ids(taxons_json_df, nodes_df) 117 | nodes_df = add_taxon_basepath_to_df(nodes_df, taxon_df) 118 | 119 | # overwrite option? should it be an option or default? 120 | nodes_df.to_csv(nodes_path.replace(".csv.gz", "_taxon_base_path.csv.gz"), sep="\t", compression="gzip", 121 | index=False) 122 | # save taxon-specific dataframe 123 | taxon_output_path = os.path.join(DATA_DIR, "processed_data", args.taxon_output_filename) 124 | taxon_df.to_csv(taxon_output_path, compression="gzip", index=False) 125 | else: 126 | print("Files do not exist:\n {}: {},\n {}: {}".format(taxons_path, os.path.exists(taxons_path), nodes_path, 127 | os.path.exists(nodes_path))) 128 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing to govuk-network-data 2 | ================================== 3 | 4 | Welcome! govuk-network-data is a community project that aims to work for a wide 5 | range of Python users and Python codebases. If you're trying govuk-network-data on 6 | your Python code, your experience and what you can contribute are 7 | important to the project's success. 8 | 9 | 10 | Getting started, building, and testing 11 | -------------------------------------- 12 | 13 | If you haven't already, take a look at the project's 14 | [README.md file](README.md). 15 | 16 | Discussion 17 | ---------- 18 | 19 | If you've run into behavior in govuk-network-data you don't understand, or you're 20 | having trouble working out a good way to apply it to your code, or 21 | you've found a bug or would like a feature it doesn't have, we want to 22 | hear from you! 23 | 24 | Our main forum for discussion is the project's [GitHub issue 25 | tracker](https://github.com/python/mypy/issues). This is the right 26 | place to start a discussion of any of the above or most any other 27 | topic concerning the project. 28 | 29 | #### Code of Conduct 30 | 31 | Everyone participating in the govuk-network-data community, and in particular in our 32 | issue tracker, pull requests, and Slack channel, is expected to treat 33 | other people with respect and more generally to follow the guidelines 34 | articulated in the [Python Community Code of 35 | Conduct](https://www.python.org/psf/codeofconduct/). 36 | 37 | Submitting Changes 38 | ------------------ 39 | 40 | Even more excellent than a good bug report is a fix for a bug, or the 41 | implementation of a much-needed new feature. (*) We'd love to have 42 | your contributions. 43 | 44 | (*) If your new feature will be a lot of work, we recommend talking to 45 | us early -- see below. 
46 | 47 | We use the usual GitHub pull-request flow, which may be familiar to 48 | you if you've contributed to other projects on GitHub. For the mechanics, 49 | see [our git and GitHub workflow help page](https://github.com/python/mypy/wiki/Using-Git-And-GitHub), 50 | or [GitHub's own documentation](https://help.github.com/articles/using-pull-requests/). 51 | 52 | Anyone interested in govuk-network-data may review your code. One of the govuk-network-data core 53 | developers will merge your pull request when they think it's ready. 54 | For every pull request, we aim to promptly either merge it or say why 55 | it's not yet ready; if you go a few days without a reply, please feel 56 | free to ping the thread by adding a new comment. 57 | 58 | Preparing Changes 59 | ----------------- 60 | 61 | Before you begin: if your change will be a significant amount of work 62 | to write, we highly recommend starting by opening an issue laying out 63 | what you want to do. That lets a conversation happen early in case 64 | other contributors disagree with what you'd like to do or have ideas 65 | that will help you do it. 66 | 67 | The best pull requests are focused, clearly describe what they're for 68 | and why they're correct, and contain tests for whatever changes they 69 | make to the code's behavior. As a bonus these are easiest for someone 70 | to review, which helps your pull request get merged quickly! Standard 71 | advice about good pull requests for open-source projects applies. 72 | 73 | For coding conventions see the reference to 74 | [PEP 8](https://www.python.org/dev/peps/pep-0008/) -- for the code you 75 | put in the pull request. 76 | 77 | Also, do not squash your commits after you have submitted a pull request, as this 78 | erases context during review. We will squash commits when the pull request is merged. 79 | 80 | You may also find other pages in the 81 | [govuk-network-data developer guide](https://github.com/python/mypy/wiki/Developer-Guides) 82 | helpful in developing your change. 83 | 84 | 85 | Core developer guidelines 86 | ------------------------- 87 | 88 | Core developers should follow these rules when processing pull requests: 89 | 90 | * Always wait for tests to pass before merging PRs. 91 | * Use "[Squash and merge](https://github.com/blog/2141-squash-your-commits)" 92 | to merge PRs. 93 | * Delete branches for merged PRs (by core devs pushing to the main repo). 94 | * Edit the final commit message before merging to conform to the following 95 | style (we wish to have a clean `git log` output): 96 | * When merging a multi-commit PR make sure that the commit message doesn't 97 | contain the local history from the committer and the review history from 98 | the PR. Edit the message to only describe the end state of the PR. 99 | * Make sure there is a *single* newline at the end of the commit message. 100 | This way there is a single empty line between commits in `git log` 101 | output. 102 | * Split lines as needed so that the maximum line length of the commit 103 | message is under 80 characters, including the subject line. 104 | * Capitalize the subject and each paragraph. 105 | * Make sure that the subject of the commit message has no trailing dot. 106 | * Use the imperative mood in the subject line (e.g. "Fix typo in README"). 107 | * If the PR fixes an issue, make sure something like "Fixes #xxx." occurs 108 | in the body of the message (not in the subject). 109 | * Use Markdown for formatting. 
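
For example, a squash-merged commit message following these rules might
look like this (illustrative only -- the issue number is made up):

    Add taxon base_path translation to the node pipeline

    Map taxon content_ids in the node file to their base_paths and compute
    each taxon's level, parents and top-most parent.

    Fixes #123.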
110 | 111 | 112 | Issue-tracker conventions 113 | ------------------------- 114 | 115 | We aim to reply to all new issues promptly. We'll assign a milestone 116 | to help us track which issues we intend to get to when, and may apply 117 | labels to carry some other information. Here's what our milestones 118 | and labels mean. 119 | 120 | Sometimes this information might be on Trello and not duplicated on Github, however when we open this code up we should endeavour to rely on Github. 121 | 122 | ### Task priority and sizing 123 | 124 | We use GitHub "labels" ([see our 125 | list](https://github.com/python/mypy/labels)) to roughly order what we 126 | want to do soon and less soon. There's two dimensions taken into 127 | account: **priority** (does it matter to our users) and **size** (how 128 | long will it take to complete). 129 | 130 | Bugs that aren't a huge deal but do matter to users and don't seem 131 | like a lot of work to fix generally will be dealt with sooner; things 132 | that will take longer may go further out. 133 | 134 | We are trying to keep the backlog at a manageable size, an issue that is 135 | unlikely to be acted upon in foreseeable future is going to be 136 | respectfully closed. This doesn't mean the issue is not important, but 137 | rather reflects the limits of the team. 138 | 139 | The **question** label is for issue threads where a user is asking a 140 | question but it isn't yet clear that it represents something to actually 141 | change. We use the issue tracker as the preferred venue for such 142 | questions, even when they aren't literally issues, to keep down the 143 | number of distinct discussion venues anyone needs to track. These might 144 | evolve into a bug or feature request. 145 | 146 | Issues **without a priority or size** haven't been triaged. We aim to 147 | triage all new issues promptly, but there are some issues from previous 148 | years that we haven't yet re-reviewed since adopting these conventions. 149 | 150 | ### Other labels 151 | 152 | * **needs discussion**: This issue needs agreement on some kind of 153 | design before it makes sense to implement it, and it either doesn't 154 | yet have a design or doesn't yet have agreement on one. 155 | * **feature**, **bug**, **crash**, **refactoring**, **documentation**: 156 | These classify the user-facing impact of the change. Specifically 157 | "refactoring" means there should be no user-facing effect. 158 | * **topic-** labels group issues touching a similar aspect of the 159 | project, for example PEP 484 compatibility, a specific command-line 160 | option or dependency. 161 | -------------------------------------------------------------------------------- /src/data/bq_extract_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import fnmatch 4 | import logging.config 5 | import os 6 | import sys 7 | import traceback 8 | 9 | import pandas as pd 10 | 11 | 12 | def find_query(query_arg, query_dir): 13 | """(str, str) -> str 14 | Return the relative path of the first file in 15 | query_dir that contains a match for query_arg string. 16 | The first file will be based on alphabetical order. 
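    Matching uses fnmatch with the pattern "*<query_arg>*.sql", so e.g. 'taxon_ab' would match the stnd_taxon_ab query file.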
17 | >>>find_query('work', './') 18 | './work' 19 | """ 20 | for file in os.listdir(query_dir): 21 | if fnmatch.fnmatch(file, "*" + query_arg + "*.sql"): 22 | return os.path.join(query_dir, file) 23 | 24 | 25 | def read_query(filepath): 26 | """(str) -> str 27 | Opens the file at filepath for reading, removing /n 28 | before rejoining seperate lines with " " seperator. 29 | """ 30 | with open(filepath, 'r') as file: 31 | lines = " ".join(line.strip("\n") for line in file) 32 | return lines 33 | 34 | 35 | def change_timestamp(x, date, dialect): 36 | """(str, str, str) -> str 37 | Replace the timestamp in x, where x is the SQL query from file, 38 | with the date, using the desired SQL dialect, which defaults to legacy. 39 | """ 40 | if dialect == "standard": 41 | return x.replace("TIME_STAMP", date.replace("-", "")) 42 | else: 43 | change = str("TIMESTAMP(\"") + date + "\"), " + str("TIMESTAMP(\"") + date + "\"))" 44 | return x.replace("TIME_STAMP", change) 45 | 46 | 47 | def looped_query(query_from_file, date_range, exclude_dates, project_id, key_path, destination_dir, filename_stub, 48 | dialect="legacy"): 49 | """(str, list, list, str, str, str, str) -> file 50 | Saves a compressed csv with filename_stub suffixed to date queried 51 | into destination_dir. They'll be one .csv per day queried. The query is 52 | derived from query_from_file and run against dates in the date_range 53 | that are not excluded by exclude_dates. The project_id and key_path 54 | are used to query the correct table and provide the permissions 55 | for the query to run using BigQuery. These csv files can be 56 | merged later in the pipeline with make_dataset.py. 57 | """ 58 | runs = len(date_range) - len(exclude_dates) 59 | 60 | logging.info(query_from_file) 61 | 62 | for i, date in enumerate(date_range): 63 | logger.info("RUN {} OUT OF {}".format(str(i + 1), runs)) 64 | if date not in exclude_dates: 65 | df_in = None 66 | logger.info("Working on: {}".format(date)) 67 | logger.info("Query start...") 68 | query_for_paths = change_timestamp(query_from_file, date, dialect) 69 | 70 | try: 71 | df_in = pd.io.gbq.read_gbq(query_for_paths, 72 | project_id=project_id, 73 | reauth=False, 74 | # verbose=True, 75 | private_key=key_path, 76 | dialect=dialect) 77 | except Exception as e: 78 | logging.error("Oops, gbq failed.\n======\n {} \n======\n".format(traceback.format_exc())) 79 | 80 | if df_in is not None: 81 | file_name = os.path.join(destination_dir, filename_stub + "_" + str(date) + '.csv.gz') 82 | logger.info("Saving at: {}".format(file_name)) 83 | df_in.to_csv(file_name, compression='gzip', index=False, sep="\t") 84 | logger.info("Saved to file.") 85 | else: 86 | logger.error("Nothing to save, query failed.") 87 | 88 | else: 89 | logger.info("Skipped target date: {}".format(date)) 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser( 94 | description='BigQuery extractor module', 95 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 96 | parser.add_argument('start_date', help='Start date in Y-m-d, eg 2018-12-31') 97 | parser.add_argument('end_date', help='End date in Y-m-d, eg 2018-12-31') 98 | parser.add_argument('filename', help='Naming convention for resulting dataframe file(s).') 99 | parser.add_argument('query', help=''' 100 | Name of query to use, within queries directory (specified by 101 | environment variable QUERIES_DIR). The first file in query_dir that 102 | contains a match for query string is used, this is based on 103 | alphabetical order. 
104 | ''') 105 | parser.add_argument('dest_dir', default="", nargs="?", 106 | help='Specialized destination directory for resulting dataframe file(s).') 107 | parser.add_argument('--standard', action='store_true', default=False, 108 | help='Specify BigQuery dialect. Legacy default.') 109 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 110 | parser.add_argument('--ab_test_prefix', help=''' 111 | For use with the stnd_taxon_ab query, prefix of the value in the AB 112 | test custom dimension, the bit before the colon, not including it, for 113 | example, if you care about values 'RelatedLinksAATest:A' and 114 | 'RelatedLinksAATest:B', pass 'RelatedLinksAATest' through this arg. 115 | ''') 116 | args = parser.parse_args() 117 | if args.standard: 118 | dialect = "standard" 119 | else: 120 | dialect = "legacy" 121 | # Logger setup 122 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 123 | logging.config.fileConfig(LOGGING_CONFIG) 124 | logger = logging.getLogger('bq_extract') 125 | 126 | if args.quiet: 127 | logging.disable(logging.DEBUG) 128 | # BQ PROJECT SETUP 129 | ProjectID = 'govuk-bigquery-analytics' 130 | KEY_DIR = os.getenv("BQ_KEY_DIR") 131 | key_file_path = os.path.join(KEY_DIR, os.listdir(KEY_DIR)[0]) 132 | 133 | # DATA DIRECTORIES 134 | QUERIES_DIR = os.getenv("QUERIES_DIR") 135 | DATA_DIR = os.getenv("DATA_DIR") 136 | dest_dir = os.path.join(DATA_DIR, args.dest_dir if args.dest_dir != "" else "raw_bq_extract") 137 | 138 | # DATAFRAME FILENAME(S) 139 | filename = args.filename 140 | 141 | # DATES TO EVALUATE 142 | start_date = datetime.datetime.strptime(args.start_date, '%Y-%m-%d') 143 | end_date = datetime.datetime.strptime(args.end_date, '%Y-%m-%d') 144 | date_list = list(map(lambda x: x.strftime("%Y-%m-%d"), pd.date_range(start_date, end_date).tolist())) 145 | 146 | # RESOLVE QUERY FROM ARG 147 | if len(args.query) > 1: 148 | query_path = find_query(args.query, QUERIES_DIR) 149 | 150 | # If dest_dir doesn't exist, create it. 151 | if not os.path.isdir(dest_dir): 152 | logging.info("Specified destination directory \"{}\" does not exist, creating...".format(dest_dir)) 153 | os.mkdir(dest_dir) 154 | 155 | logger.info( 156 | "\n======\nStart date: {} \nEnd date: {} \nDestination directory: {}\ 157 | \nFilename: {} \nQuery: {}\n======\n".format( 158 | start_date, 159 | end_date, 160 | dest_dir, 161 | filename, 162 | query_path)) 163 | 164 | if query_path is not None: 165 | logger.info("Specified query exists, running...") 166 | query = read_query(query_path) 167 | 168 | if "AB_DIMENSION_VALUE_PREFIX" in query: 169 | try: 170 | query = query.replace( 171 | "AB_DIMENSION_VALUE_PREFIX", args.ab_test_prefix) 172 | except TypeError: 173 | logging.error( 174 | f"Tried to replace AB_DIMENSION_VALUE_PREFIX in query," 175 | f" ab_test_prefix argument is {args.ab_test_prefix}") 176 | sys.exit() 177 | looped_query(query, date_list, [], ProjectID, key_file_path, dest_dir, filename, dialect) 178 | else: 179 | logger.info("Query failed, not enough info provided") 180 | -------------------------------------------------------------------------------- /src/data/preprocess.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | 5 | 6 | def clean_tuple(pe_str_tuple): 7 | """ 8 | TODO: not sure why this is here... 
maybe quotes break things 9 | Transform raw SQL BigQuery string to list of page/event tuples: 10 | :param pe_str_tuple: a tuple, ideally length 2 (page1,eventCategory1<:>page2<>... into a 20 | list of tuples page_event_list = [(page1,eventCategory1<:>iii\.+|>>\.+|\s>>>\s", "", bq_journey_string) 28 | page_event_list = [] 29 | for hit in bq_journey_string.split(">>"): 30 | # Old delimiter: split("//") 31 | page_event_tup = clean_tuple(hit.split("<<")) 32 | # For len==3 Taxon present within bq_journey_string 33 | if len(page_event_tup) == 2 or len(page_event_tup) == 3: 34 | page_event_list.append(tuple(page_event_tup)) 35 | else: 36 | # TODO remove in future 37 | print("Error, tuple split generated too many elements.") 38 | print("Overall BigQuery string:", bq_journey_string) 39 | print("Too long page_event tuple:", page_event_tup) 40 | # Add in dummy variable for debugging and to avoid empty lists 41 | # Useful for inspecting real data, uncomment if desired 42 | # page_event_list.append(("page1","eventCategory<: 0: 62 | position_dict = [(0, page_event_list[0])] 63 | for i, (page, event) in enumerate(page_event_list[1:]): 64 | # print(i) 65 | if page != page_event_list[i][0]: 66 | index = position_dict[-1][0] 67 | position_dict.append((index + 1, (page, event))) 68 | elif page == page_event_list[i][0] and (event != position_dict[-1][1][1]): 69 | position_dict.append((position_dict[-1][0], (page, event))) 70 | return position_dict 71 | return np.NaN 72 | 73 | 74 | def split_event(event_str): 75 | """ 76 | Split eventCategory<: 3: 93 | print("Event tuple has more than two elements:", event_tuple) 94 | print("Original:", event_str) 95 | # event_tuple = (event_tuple[0], "<<".join(event_tuple[1:])) 96 | if len(event_tuple) == 2: 97 | print("Event tuple has only one element:", event_tuple) 98 | print("Original:", event_str) 99 | 100 | 101 | def extract_pe_components(page_event_list, i): 102 | """ 103 | Extract page_list or event_list from page_event_list 104 | :param page_event_list: list of (page,event) tuples 105 | :param i: 0 for page_list 1, for event_list 106 | :return: appropriate hit_list 107 | """ 108 | hit_list = [] 109 | # page_event is a tuple 110 | for page_event in page_event_list: 111 | if i == 0 and page_event[1] == "PAGE<:>A>>B page loops from page_list. Saved as new dataframe column. 
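    e.g. ["/a", "/a", "/b", "/b", "/a"] collapses to ["/a", "/b", "/a"]; only consecutive repeats are removed.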
159 | :param page_list: the list of pages to de-loop 160 | :return: de-loop page list 161 | """ 162 | return [node for i, node in enumerate(page_list) if i == 0 or node != page_list[i - 1]] 163 | 164 | 165 | # Network things, should probably be moved somewhere else 166 | def start_end_page(page_list): 167 | """ 168 | Find start and end pages (nodes) in a list of page hits 169 | :param page_list: list of page hits 170 | :return: start and end nodes 171 | """ 172 | if len(page_list) == 1: 173 | return page_list[0] 174 | else: 175 | return page_list[0], page_list[-1] 176 | 177 | 178 | def subpaths_from_list(page_list): 179 | """ 180 | Build node pairs (edges) from a list of page hits 181 | :param page_list: list of page hits 182 | :return: list of all possible node pairs 183 | """ 184 | return [[page, page_list[i + 1]] for i, page in enumerate(page_list) if i < len(page_list) - 1] 185 | 186 | 187 | def start_page(page_list): 188 | """ 189 | First page/node in a list of page hits 190 | :param page_list: list of page hits 191 | :return: First page 192 | """ 193 | return page_list[0] 194 | 195 | 196 | def end_page(page_list): 197 | """ 198 | Last page/node in a list of page hits 199 | :param page_list: list of page hits 200 | :return: last page 201 | """ 202 | return page_list[-1] 203 | 204 | 205 | def start_end_subpath_list(subpath_list): 206 | """ 207 | First and last page from list of node pairs 208 | :param subpath_list: list of node pairs 209 | :return: first and last page 210 | """ 211 | return subpath_list[0][0], subpath_list[-1][-1] 212 | 213 | 214 | def start_end_edges_subpath_list(subpath_list): 215 | """ 216 | First/last node pairs (edges) from list of node pairs 217 | :param subpath_list: list of node pairs 218 | :return: first and last node pairs 219 | """ 220 | return subpath_list[0], subpath_list[-1] 221 | -------------------------------------------------------------------------------- /notebooks/taxon/taxon_translate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import os\n", 11 | "import numpy as np\n", 12 | "import json\n", 13 | "from ast import literal_eval" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "DATA_DIR = os.getenv(\"DATA_DIR\")\n", 23 | "filename = \"preprocessed_with_dupes_31_10_taxon2.csv.gz\"\n", 24 | "path = os.path.join(DATA_DIR,\"output\", filename)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.read_csv(path,sep=\"\\t\",compression=\"gzip\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df[\"Taxon_List\"] = df[\"Taxon_List\"].map(literal_eval)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def taxon_split(taxon_list):\n", 52 | " return [t for taxon in taxon_list for t in taxon.split(\",\")]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "#### Build list of unique taxons, excluding \"other\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 
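    "# NOTE: needs 'from collections import Counter' (not included in the imports cell above)\n",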
68 | "taxon_counter = Counter()\n", 69 | "for tup in df.itertuples():\n", 70 | " taxons = taxon_split(tup.Taxon_List)\n", 71 | " for taxon in taxons:\n", 72 | " taxon_counter[taxon]+=1\n", 73 | "len(taxon_counter) " 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "#### Map taxon `content_id` to `base_path` using content tagger extract" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "taxon_path = os.path.join(os.getenv(\"DOCUMENTS\"),\"taxons.json.gz\")\n", 90 | "taxon_df = pd.read_json(taxon_path,compression=\"gzip\")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# taxon_path = os.path.join(os.path.dirname(os.getenv(\"DOCUMENTS\")), \"Downloads\", \"2018-11-19 Taxonomy.csv\")\n", 100 | "# taxon_df = pd.read_csv(taxon_path)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "taxon_df" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "taxon_df.shape" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "taxon_df.columns" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Count taxons present in both journeys and taxon export and write to file" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "found = 0\n", 144 | "with open(\"taxon_id_title_311018.tsv\",\"w\") as writer:\n", 145 | " writer.write(\"content_id\\ttitle\\tbase_path\\tparent_content_id\\n\")\n", 146 | " for taxon,value in taxon_counter.items():\n", 147 | " temp = taxon_df[taxon_df.content_id==taxon]\n", 148 | " if temp.shape[0]>0:\n", 149 | " found +=1\n", 150 | "# print(taxon,\",\",temp.iloc[0].title)\n", 151 | " writer.write(\"{}\\t{}\\t{}\\t{}\\n\".format(taxon,\n", 152 | " temp.iloc[0].title,\n", 153 | " temp.iloc[0].base_path,\n", 154 | " temp.iloc[0].parent_content_id))\n", 155 | "found" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "(found*100)/taxon_df.shape[0]\n", 165 | "\n", 166 | "## Translate content_id to level + parents\n", 167 | "\n", 168 | "def recursive_parenting(df,content_id,parent_content_id,parent_list):\n", 169 | " if isinstance(parent_content_id,float) and len(parent_list)==0:\n", 170 | " return []\n", 171 | " elif isinstance(parent_content_id,float):\n", 172 | " return [[thing,i+1]for i,thing in enumerate(reversed(parent_list))]\n", 173 | " else:\n", 174 | " content_id = parent_content_id\n", 175 | " parent_content_id = df[df.content_id==parent_content_id].iloc[0].parent_content_id\n", 176 | " title = df[df.content_id==content_id].iloc[0].title\n", 177 | " parent_list.append([content_id,parent_content_id,title])\n", 178 | " return recursive_parenting(df,content_id,parent_content_id,parent_list)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "column_list = ['content_id','title','level','parents','level1_parent']\n", 188 | 
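    "# One row per taxon: level = len(parents) + 1; level1_parent falls back to the taxon's own title when it has no parents\n",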
"taxon_level_df = pd.DataFrame(columns=column_list)\n", 189 | "missed=0\n", 190 | "for content_id,value in taxon_counter.items():\n", 191 | " if taxon_df[taxon_df.content_id==content_id].shape[0] > 0:\n", 192 | " title = taxon_df[taxon_df.content_id==content_id].iloc[0].title\n", 193 | " parent_list = pd.Series(recursive_parenting(taxon_df,content_id,\n", 194 | " taxon_df[taxon_df.content_id==content_id].parent_content_id.values[0],[]))\n", 195 | " current_level = len(parent_list)+1\n", 196 | " level1_par = title\n", 197 | " if len(parent_list.values) > 0:\n", 198 | " level1_par = parent_list.values[0][0][2]\n", 199 | " taxon_level_df = pd.concat([taxon_level_df,pd.DataFrame([[content_id,\n", 200 | " title,\n", 201 | " current_level,\n", 202 | " parent_list.values,\n", 203 | " level1_par]],columns=column_list)])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "taxon_level_df" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "taxon_level_df.to_csv(\"taxon_level_df.tsv\",sep='\\t',index=False)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Count parent taxons, self-parenting if nan" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "counter =0\n", 238 | "parent_taxons = Counter()\n", 239 | "for taxon,value in taxon_counter.items():\n", 240 | " temp = taxon_df[taxon_df.content_id==taxon]\n", 241 | " if temp.shape[0]>0:\n", 242 | " taxon_base_path = temp.iloc[0].base_path\n", 243 | " parent = None\n", 244 | " if isinstance(temp.iloc[0].parent_content_id,str):\n", 245 | " parent = taxon_df[taxon_df.content_id == temp.iloc[0].parent_content_id].iloc[0].title\n", 246 | " else:\n", 247 | " parent = temp.iloc[0].title\n", 248 | " parent_taxons[parent]+=value" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "list(parent_taxons.most_common(30))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "len(parent_taxons)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.6.0" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /src/analysis/journey_events_analysis.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging.config 3 | import os 4 | from ast import literal_eval 5 | from collections import Counter 6 | 7 | 
import pandas as pd 8 | from scipy import stats 9 | 10 | AGGREGATE_COLUMNS = ['DeviceCategories', 'Event_cats_agg', 'Event_cat_act_agg'] 11 | 12 | NAVIGATE_EVENT_CATS = ['breadcrumbClicked', 13 | 'homeLinkClicked', 14 | '/search', 15 | 'navDocumentCollectionLinkClicked', 16 | 'navAccordionLinkClicked', 17 | 'navLeafLinkClicked', 18 | 'navPolicyAreaLinkClicked', 19 | 'navServicesInformationLinkClicked', 20 | 'navSubtopicContentItemLinkClicked', 21 | 'navSubtopicLinkClicked', 22 | 'navTopicLinkClicked', 23 | 'relatedTaxonomyLinkClicked', 24 | 'stepNavHeaderClicked', 'stepNavLinkClicked', 'stepNavPartOfClicked'] 25 | 26 | # Useful for explicit event category and action matching, may extend in the future 27 | NAVIGATE_EVENT_CATS_ACTS = [('relatedLinkClicked', 'Explore the topic')] 28 | 29 | 30 | def device_count(x, device): 31 | return sum([value for item, value in x if item == device]) 32 | 33 | 34 | def has_related_event(sequence_str): 35 | return all(cond in sequence_str for cond in ["relatedLinkClicked", "Related content"]) 36 | 37 | 38 | def has_nav_event_cat(sequence_str): 39 | return any(event_cat in sequence_str for event_cat in NAVIGATE_EVENT_CATS) 40 | 41 | 42 | def has_nav_event_cat_act(sequence_str): 43 | return any( 44 | event_cat in sequence_str and event_act in sequence_str for event_cat, event_act in NAVIGATE_EVENT_CATS_ACTS) 45 | 46 | 47 | def map_device_counter(df): 48 | """ 49 | Count the device-based occurrences per target device and add as new cols. 50 | 51 | Tablet is ignored as it is assumed to have been filtered. 52 | :param df: 53 | :return: 54 | """ 55 | logging.info("Mapping device counts") 56 | df["DesktopCount"] = df['DeviceCategories'].map(lambda x: device_count(x, "desktop")) 57 | df["MobileCount"] = df['DeviceCategories'].map(lambda x: device_count(x, "mobile")) 58 | 59 | 60 | def chi2_test(vol_desk, vol_mobile, vol_mobile_rel, vol_desk_rel): 61 | vol_mobile_no_rel = vol_mobile - vol_mobile_rel 62 | vol_desk_no_rel = vol_desk - vol_desk_rel 63 | obs = [[vol_mobile_rel, vol_mobile_no_rel], [vol_desk_rel, vol_desk_no_rel]] 64 | return stats.chi2_contingency(obs) 65 | 66 | 67 | def compute_volumes(df, occ_cols): 68 | return (df[occ].sum() for occ in occ_cols) 69 | 70 | 71 | def compute_percents(nums, denoms): 72 | if len(nums) == len(denoms): 73 | return (round((num * 100) / denom, 2) for num, denom in zip(nums, denoms)) 74 | return -1 75 | 76 | 77 | def compute_stats(df, df_filtered, occ_cols): 78 | logger.info("Computing occurrence-based statistics...") 79 | 80 | ind = ["All", "All_related", "Desktop", "Desktop_rel", "Mobile", "Mobile_rel"] 81 | cols = ["Volume", "Percentage", "Shape"] 82 | df_stats = pd.DataFrame(index=ind, columns=cols) 83 | 84 | vol_all, vol_desk, vol_mobile = compute_volumes(df, occ_cols) 85 | vol_all_related, vol_desk_rel, vol_mobile_rel = compute_volumes(df_filtered, occ_cols) 86 | 87 | percent_from_desk, percent_from_mobile = compute_percents([vol_desk, vol_mobile], 2 * [vol_all]) 88 | 89 | percent_related, percent_from_desk_rel, percent_from_mobile_rel = compute_percents( 90 | [vol_all_related, vol_desk_rel, vol_mobile_rel], 91 | [vol_all, vol_desk, vol_mobile]) 92 | 93 | df_stats["Volume"] = [vol_all, vol_all_related, 94 | vol_desk, vol_desk_rel, 95 | vol_mobile, vol_mobile_rel] 96 | df_stats["Percentage"] = [100, percent_related, 97 | percent_from_desk, percent_from_desk_rel, 98 | percent_from_mobile, percent_from_mobile_rel] 99 | 100 | # a, b, c, _ = chi2_test(vol_desk, vol_mobile, vol_mobile_rel, vol_desk_rel) 101 | 102 | return 
df_stats 103 | 104 | 105 | def weight_seq_length(page_lengths, occurrences, name): 106 | length_occ = Counter() 107 | for length, occ in zip(page_lengths, occurrences): 108 | length_occ[length] += occ 109 | data = [] 110 | for key, value in length_occ.items(): 111 | for i in range(value): 112 | data.append(key) 113 | return pd.Series(data, name=name) 114 | 115 | 116 | def list_zipper(df_list, count_cols, names, col_to_describe): 117 | return [[df_all[col_to_describe], df_all[count_col], name] for df_all, count_col, name in 118 | zip(df_list, count_cols, names)] 119 | 120 | 121 | def describe_dfs(df_list_all, df_list_filtered, col_to_describe, count_cols): 122 | """ 123 | 124 | :param df: 125 | :param df_related: 126 | :param col_to_describe: 127 | :return: 128 | """ 129 | 130 | logger.info("Computing statistics for {}".format(col_to_describe)) 131 | descriptive = pd.DataFrame() 132 | names_all = ["All_" + name for name in ["Journeys", "Desktop", "Mobile"]] 133 | names_rel = [name + "_Related" for name in ["Journeys", "Desktop", "Mobile"]] 134 | 135 | to_eval = list_zipper(df_list_all, count_cols, names_all, col_to_describe) + list_zipper(df_list_filtered, 136 | count_cols, 137 | names_rel, col_to_describe) 138 | 139 | for length, occ, name in to_eval: 140 | sr = weight_seq_length(length, occ, name).describe().apply(lambda x: format(x, '.3f')) 141 | descriptive[sr.name] = sr 142 | 143 | return descriptive 144 | 145 | 146 | def column_eval(df): 147 | """ 148 | Change type of specified columns from str to list. Compute Page_List lengths, if missing. 149 | :param df: 150 | :return: void, inplace 151 | """ 152 | logger.info("Literal eval...") 153 | for column in AGGREGATE_COLUMNS: 154 | if column in df.columns and not isinstance(df[column].iloc[0], list): 155 | print("Working on column: {}".format(column)) 156 | df[column] = df[column].map(literal_eval) 157 | if "PageSeq_Length" not in df.columns: 158 | logger.info("Computing PageSeq_Length...") 159 | df['Page_List'] = df['Page_List'].map(literal_eval) 160 | df['PageSeq_Length'] = df['Page_List'].map(len) 161 | 162 | 163 | def initialize(filename, reports_dest): 164 | df = pd.read_csv(filename, sep="\t", compression="gzip") 165 | column_eval(df) 166 | # For dataframe files that include tablet devices 167 | df["TabletCount"] = df['DeviceCategories'].map(lambda x: device_count(x, "tablet")) 168 | df["Occurrences"] = df["Occurrences"] - df["TabletCount"] 169 | 170 | map_device_counter(df) 171 | 172 | df["Has_Related"] = df["Sequence"].map(has_related_event) 173 | 174 | # Journeys per device 175 | desktop_journeys = df[df.DesktopCount > 0] 176 | mobile_journeys = df[df.MobileCount > 0] 177 | 178 | # Related journeys, all/per device 179 | df_related = df[df["Has_Related"]] 180 | desk_rel_journeys = desktop_journeys[desktop_journeys["Has_Related"]] 181 | mobile_rel_journeys = mobile_journeys[mobile_journeys["Has_Related"]] 182 | 183 | occurrence_cols = ["Occurrences", "DesktopCount", "MobileCount"] 184 | 185 | df_stats = compute_stats(df, df_related, occurrence_cols) 186 | df_stats['Shape'] = [df.shape[0], df_related.shape[0], desktop_journeys.shape[0], desk_rel_journeys.shape[0], 187 | mobile_journeys.shape[0], mobile_rel_journeys.shape[0]] 188 | 189 | descriptive_df = describe_dfs([df, desktop_journeys, mobile_journeys], 190 | [df_related, desk_rel_journeys, mobile_rel_journeys], 191 | "PageSeq_Length", occurrence_cols) 192 | 193 | df_stats.to_csv(os.path.join(reports_dest, "device_rel_stats.csv")) 194 | 
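    # PageSeq_Length descriptives are occurrence-weighted: each journey's length is repeated by its occurrence count (see weight_seq_length).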
descriptive_df.to_csv(os.path.join(reports_dest, "PageSeq_Length" + "_describe.csv")) 195 | 196 | 197 | if __name__ == "__main__": 198 | parser = argparse.ArgumentParser(description='Module to run analysis on user journeys in terms of a specific' 199 | 'event(s). For now focusing on \'Related content\' links. Reads' 200 | 'in data from the \'processed_journey\' directory.') 201 | parser.add_argument('input_filename', help='Source user journey file to analyse.') 202 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 203 | args = parser.parse_args() 204 | 205 | DATA_DIR = os.getenv("DATA_DIR") 206 | REPORTS_DIR = os.getenv("REPORTS_DIR") 207 | source_directory = os.path.join(DATA_DIR, "processed_journey") 208 | dest_directory = os.path.join(REPORTS_DIR, args.input_filename) 209 | input_file = os.path.join(source_directory, args.input_filename + ".csv.gz") 210 | 211 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 212 | logging.config.fileConfig(LOGGING_CONFIG) 213 | logger = logging.getLogger('user_journey_event_analysis') 214 | 215 | if args.quiet: 216 | logging.disable(logging.DEBUG) 217 | 218 | if os.path.isfile(input_file): 219 | if not os.path.isdir(dest_directory): 220 | logging.info( 221 | "Specified destination directory \"{}\" does not exist, creating...".format(dest_directory)) 222 | os.mkdir(dest_directory) 223 | initialize(input_file, dest_directory) 224 | else: 225 | logging.info( 226 | "Specified destination directory \"{}\" exists, adding \'v2\' to results...".format(dest_directory)) 227 | -------------------------------------------------------------------------------- /src/data/make_network_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import logging.config 4 | import os 5 | import re 6 | import sys 7 | from ast import literal_eval 8 | from collections import Counter 9 | 10 | import pandas as pd 11 | 12 | src = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 13 | sys.path.append(os.path.join(src, "data")) 14 | import preprocess as prep 15 | 16 | COLUMNS_TO_KEEP = ['Page_List', 'Page_List_NL', 'PageSequence', 'Page_Seq_NL', 'Occurrences', 'Page_Seq_Occurrences', 17 | 'Occurrences_NL'] 18 | NODE_ATTRIBUTES = ['Taxon_Page_List'] 19 | OCCURRENCES = ['Occurrences_NL', 'Page_Seq_Occurrences'] 20 | 21 | 22 | def read_file(filename, columns_to_read, collapse_search=False, use_delooped_journeys=False, 23 | drop_incorrect_occ=False, with_attribute=False): 24 | """ 25 | Read a dataframe compressed csv file, init as dataframe, drop unnecessary columns, prepare target columns 26 | to be evaluated as lists with literal_eval. 
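    Note: a 30% random sample of one-off journeys (Occurrences == 1) is also dropped here to thin the data (see the sampling block below).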
27 | :param with_attribute: 28 | :param use_delooped_journeys: 29 | :param drop_incorrect_occ: 30 | :param filename: processed_journey dataframe 31 | :return: processed for list-eval dataframe 32 | """ 33 | logger.debug("Reading file {}...".format(filename)) 34 | df = pd.read_csv(filename, sep='\t', compression="gzip", skipinitialspace=True, usecols=columns_to_read) 35 | logger.debug("Read in {} columns...".format(df.columns)) 36 | 37 | if drop_incorrect_occ and all(col in df.columns for col in OCCURRENCES): 38 | logger.debug("Dropping incorrect occurrence counts...") 39 | df.drop(['Occurrences_NL', 'Page_Seq_Occurrences'], axis=1, inplace=True) 40 | 41 | print(df.shape) 42 | print(df[df.Occurrences == 1].shape) 43 | # Sample 30% of one-off journeys and then use these indices to drop them 44 | indices = df[df.Occurrences == 1].sample(frac=0.3, random_state=1234).index 45 | print(len(indices)) 46 | df.drop(indices, inplace=True) 47 | print(df.shape) 48 | 49 | logger.debug("Number of rows post one-off occurrence drop: {}".format(df.shape)) 50 | 51 | if with_attribute: 52 | for attribute_column in NODE_ATTRIBUTES: 53 | logger.debug("Working on literal_eval for \"{}\"".format(attribute_column)) 54 | df[attribute_column] = df[attribute_column].map(literal_eval) 55 | 56 | column_to_eval = 'Page_List' 57 | 58 | if use_delooped_journeys: 59 | column_to_eval = 'Page_List_NL' 60 | 61 | if isinstance(df[column_to_eval].iloc[0], str) and any(["," in val for val in df[column_to_eval].values]): 62 | logger.debug("Working on literal_eval for \"{}\"".format(column_to_eval)) 63 | df[column_to_eval] = df[column_to_eval].map(literal_eval) 64 | 65 | if collapse_search: 66 | logger.debug("Collapsing /search nodes in \"{}\"".format(column_to_eval)) 67 | df[column_to_eval] = df[column_to_eval].map(collapse_search_page) 68 | 69 | return df 70 | 71 | 72 | def collapse_search_page(page_list): 73 | return [page for page in page_list if not (re.match(r"^/search[//?|/]", page) or page == "/search")] 74 | 75 | 76 | def compute_occurrences(user_journey_df, page_sequence, occurrences): 77 | logging.debug("Computing specialized occurrences \"{}\" based on \"{}\"...".format(occurrences, page_sequence)) 78 | user_journey_df[occurrences] = user_journey_df.groupby(page_sequence)['Occurrences'].transform( 79 | 'sum') 80 | 81 | 82 | def generate_subpaths(user_journey_df, page_list, subpaths): 83 | """ 84 | Compute lists of subpaths ie node-pairs/edges (where a node is a page) from both original and de-looped page_lists 85 | (page-hit only journeys) 86 | :param subpaths: 87 | :param page_list: 88 | :param user_journey_df: user journey dataframe 89 | :return: inplace assign new columns 90 | """ 91 | logger.debug("Setting up \"{}\" based on \"{}\"...".format(subpaths, page_list)) 92 | user_journey_df[subpaths] = user_journey_df[page_list].map(prep.subpaths_from_list) 93 | 94 | 95 | def edgelist_from_subpaths(user_journey_df, use_delooped_journeys=False): 96 | """ 97 | Generate a counter that represents the edge list. Keys are edges (node pairs) which represent a user going from 98 | first element of pair to second one), values are a sum of journey occurrences (de-looped occurrences since current 99 | computation is based on de-looped subpaths), ie number of times a user/agent went from one page (node) to another. 
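    For example, a journey A >> B >> C with a (de-looped) occurrence count of 2 adds 2 to both the (A, B) and (B, C) edge weights.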
100 | :param use_delooped_journeys: 101 | :param user_journey_df: user journey dataframe 102 | :return: edgelist counter 103 | """ 104 | subpath_default = 'Subpaths' 105 | occurrences_default = 'Page_Seq_Occurrences' 106 | page_list_default = 'Page_List' 107 | page_sequence_default = 'PageSequence' 108 | 109 | if use_delooped_journeys: 110 | logger.debug("Creating edge list from de-looped journeys (based on Subpaths_NL) ...") 111 | subpath_default = 'Subpaths_NL' 112 | occurrences_default = 'Occurrences_NL' 113 | page_list_default = 'Page_List_NL' 114 | page_sequence_default = 'Page_Seq_NL' 115 | 116 | else: 117 | logger.debug("Creating edge list from original journeys (based on Subpaths) ...") 118 | 119 | if occurrences_default not in user_journey_df.columns: 120 | compute_occurrences(user_journey_df, page_sequence_default, occurrences_default) 121 | 122 | logger.debug("Dropping duplicates {}...".format(page_sequence_default)) 123 | user_journey_df.drop_duplicates(page_sequence_default, keep="first", inplace=True) 124 | 125 | generate_subpaths(user_journey_df, page_list_default, subpath_default) 126 | edgelist_counter = Counter() 127 | 128 | ind_path = user_journey_df.columns.get_loc(subpath_default) 129 | ind_occ = user_journey_df.columns.get_loc(occurrences_default) 130 | 131 | for tup in user_journey_df.itertuples(index=False): 132 | for edge in tup[ind_path]: 133 | edgelist_counter[tuple(edge)] += tup[ind_occ] 134 | 135 | return edgelist_counter 136 | 137 | 138 | def compute_node_attribute(user_journey_df): 139 | """ 140 | 141 | :param user_journey_df: 142 | :return: 143 | """ 144 | logger.debug("Identifying node taxons from \"Taxon_Page_List\"...") 145 | node_taxon_dict = {} 146 | for tup in user_journey_df.itertuples(): 147 | for page, taxons in tup.Taxon_Page_List: 148 | if page not in node_taxon_dict.keys(): 149 | node_taxon_dict[page] = taxons 150 | return node_taxon_dict 151 | 152 | 153 | def nodes_from_edgelist(edgelist): 154 | """ 155 | Generate a node list (from edges). Internally represented as a set, returned as alphabetically sorted list 156 | :param edgelist: list of edges (node-pairs) 157 | :return: sorted list of nodes 158 | """ 159 | logger.debug("Creating node list...") 160 | nid = 0 161 | node_list = {} 162 | 163 | for keys, _ in edgelist.items(): 164 | for key in keys: 165 | if key not in node_list.keys(): 166 | node_list[key] = nid 167 | nid += 1 168 | return node_list 169 | 170 | 171 | def compute_nodes_edges(source_filename, dest_filename, cols, collapse_search, use_delooped_journeys, 172 | drop_incorrect_occ, 173 | with_attribute): 174 | """ 175 | Read processed_journey dataframe file, preprocess, compute node/edge lists, write contents of lists to file. 
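    Output is written to <dest_filename>_nodes.csv.gz and <dest_filename>_edges.csv.gz (tab-separated, gzip-compressed).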
176 | :param collapse_search: 177 | :param with_attribute: 178 | :param drop_incorrect_occ: 179 | :param use_delooped_journeys: 180 | :param source_filename: dataframe to be loaded 181 | :param dest_filename: filename prefix for node and edge files 182 | """ 183 | df = read_file(source_filename, cols, collapse_search, use_delooped_journeys, drop_incorrect_occ, with_attribute) 184 | edges = edgelist_from_subpaths(df, use_delooped_journeys) 185 | node_list = nodes_from_edgelist(edges) 186 | 187 | print(list(node_list.items())[0:10]) 188 | 189 | default_edge_header = "Source_node\tSource_id\tDestination_node\tDestination_id\tWeight\n" 190 | default_node_header = "Node\tNode_id\n" 191 | node_attr = None 192 | 193 | if with_attribute: 194 | logger.debug("Creating node-attribute (taxon) dictionary...") 195 | node_attr = compute_node_attribute(df) 196 | default_edge_header = "Source_node\tSource_id\tDestination_node\tDestination_id\tWeight\tSource_Taxon\tDestination_Taxon\n" 197 | default_node_header = "Node\tNode_id\tNode_Taxon\n" 198 | 199 | logger.info("Number of nodes: {} Number of edges: {}".format(len(node_list), len(edges))) 200 | logger.info("Writing edge list to file...") 201 | 202 | edge_writer(dest_filename + "_edges.csv.gz", default_edge_header, edges, node_list, node_attr) 203 | node_writer(dest_filename + "_nodes.csv.gz", default_node_header, node_list, node_attr) 204 | 205 | 206 | def node_writer(filename, header, node_id, node_attr): 207 | with gzip.open(filename, "w") as file: 208 | print(filename) 209 | file.write(header.encode()) 210 | for node, nid in node_id.items(): 211 | file.write("{}\t{}".format(node, nid).encode()) 212 | if node_attr is not None: 213 | file.write("\t{}".format(node_attr[node]).encode()) 214 | file.write("\n".encode()) 215 | 216 | 217 | def edge_writer(filename, header, edges, node_id, node_attr): 218 | with gzip.open(filename, "w") as file: 219 | print(filename) 220 | file.write(header.encode()) 221 | for key, value in edges.items(): 222 | file.write("{}\t{}\t{}\t{}\t{}".format(key[0], node_id[key[0]], key[1], node_id[key[1]], value).encode()) 223 | if node_attr is not None: 224 | file.write("\t{}\t{}".format(node_attr[key[0]], node_attr[key[1]]).encode()) 225 | file.write("\n".encode()) 226 | 227 | 228 | def check_header(filename): 229 | with gzip.open(filename, "rb") as reader: 230 | header = set(reader.readline().decode().replace("\n", "").split("\t")) 231 | return list(header.intersection(set(COLUMNS_TO_KEEP + NODE_ATTRIBUTES))) 232 | 233 | 234 | if __name__ == "__main__": 235 | parser = argparse.ArgumentParser(description='Module that produces node and edge files given a user journey file.') 236 | parser.add_argument('source_directory', default="", nargs="?", help='Source directory for input dataframe file(s).') 237 | parser.add_argument('input_filename', help='Source directory for input dataframe file(s).') 238 | parser.add_argument('dest_directory', default="", nargs="?", 239 | help='Specialized destination directory for output files.') 240 | parser.add_argument('output_filename', help='Naming convention for resulting node and edge files.') 241 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 242 | parser.add_argument('-d', '--delooped', action='store_true', default=False, 243 | help='Use delooped journeys for edge and weight computation') 244 | parser.add_argument('-i', '--incorrect', action='store_true', default=False, 245 | help='Drop incorrect occurrences if necessary') 246 | 
parser.add_argument('-t', '--taxon', action='store_true', default=False, 247 | help='Compute and include additional node attributes (only taxon for now).') 248 | parser.add_argument('-cs', '--collapse_search', action='store_true', default=False, 249 | help='Remove /search? page hits.') 250 | parser.add_argument('-s', '--sampling', action='store_true', default=False, 251 | help='Enable sampling (flag not currently used in this module).') 252 | 253 | args = parser.parse_args() 254 | 255 | DATA_DIR = os.getenv("DATA_DIR") 256 | source_directory = os.path.join(DATA_DIR, 257 | args.source_directory if args.source_directory != "" else "processed_journey") 258 | input_filename = os.path.join(source_directory, ( 259 | args.input_filename + ".csv.gz" if "csv.gz" not in args.input_filename else args.input_filename)) 260 | dest_directory = os.path.join(DATA_DIR, args.dest_directory if args.dest_directory != "" else "processed_network") 261 | 262 | output_filename = os.path.join(dest_directory, args.output_filename) 263 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 264 | logging.config.fileConfig(LOGGING_CONFIG) 265 | logger = logging.getLogger('make_network_data') 266 | 267 | if args.quiet: 268 | logging.disable(logging.DEBUG) 269 | 270 | if os.path.exists(input_filename): 271 | logger.info("Working on file: {}".format(input_filename)) 272 | logger.info("Using de-looped journeys: {}\nDropping incorrect occurrence counts: {}".format(args.delooped, 273 | args.incorrect)) 274 | cols = check_header(input_filename) 275 | compute_nodes_edges(input_filename, output_filename, cols, args.collapse_search, args.delooped, args.incorrect, 276 | args.taxon) 277 | else: 278 | logger.debug("Specified filename does not exist: {}".format(input_filename)) 279 | -------------------------------------------------------------------------------- /src/data/tests/check_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import os\n", 11 | "from collections import Counter" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "DOCUMENTS = os.getenv(\"DOCUMENTS\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "source_dir = os.path.join(DOCUMENTS,\"test1\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "flist = sorted([os.path.join(source_dir,f) for f in os.listdir(source_dir) if \"user_network_paths_meta_\" in f])" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "flist" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "file1 = flist[0]\n", 57 | "file2 = flist[1]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "scrolled": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "df1 = pd.read_csv(os.path.join(source_dir,file1))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "df2 = pd.read_csv(os.path.join(source_dir,file2))" 78 | ] 79 | }, 80 | { 
81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "df1.head(2)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "target = df1.Languages.iloc[0]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "type(target)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "test_out_file = os.path.join(source_dir,\"output/merge_test_1.csv.gz\")\n", 114 | "df3 = pd.read_csv(test_out_file)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df3.head(2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "target2 = df3.Sequence.iloc[0]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "df1[df1.Sequence==target2].iloc[0]" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "Counter([x for x in df1[df1.Sequence==target2].Languages.iloc[0].split(\",\")])" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "df2[df2.Sequence==target2].iloc[0]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "Counter([x for x in df2[df2.Sequence==target2].Languages.iloc[0].split(\",\")])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "df2.Occurrences.sum()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "df3.Occurrences.sum()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "[(df3.columns.get_loc(c),c) for c in df3.columns]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "COUNTABLE_AGGREGATE_COLUMNS = ['Occurrences','Languages', 'Locations', 'DeviceCategories', 'TrafficSources',\n", 205 | " 'TrafficMediums', 'NetworkLocations']" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "def dataframe_splitter(x):\n", 215 | " return [[x.columns.get_loc(\"Sequence\"),x.columns.get_loc(col)]\\\n", 216 | " for col in x.columns if col in COUNTABLE_AGGREGATE_COLUMNS]\n", 217 | "# for col in x.columns:\n", 218 | "# if col in COUNTABLE_AGGREGATE_COLUMNS:\n", 219 | "# print(x.columns.get_loc(col))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "dataframe_splitter(df3)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | 
"source": [ 237 | "df3.columns" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "for i,df in enumerate([df1,df2,df3]):\n", 247 | " for ind in dataframe_splitter(df):\n", 248 | " print(i,df.iloc[:,ind].columns)\n", 249 | " print(df.columns[ind[0]],df.columns[ind[1]])" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "def multi_column_split(list_of_dfs):\n", 259 | " return [(i,df.iloc[0:5,ind]) for i,df in enumerate(list_of_dfs) for ind in dataframe_splitter(df)]\n", 260 | "# for ind in dataframe_splitter(df):\n", 261 | "# to_ret.append(i,df.iloc[:,ind])\n", 262 | "# return to_ret" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "df1[0:5]" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "expected_size = 2\n", 281 | "final_list = [pd.DataFrame()] * expected_size\n", 282 | "print(final_list)\n", 283 | "for i,df in multi_column_split([df1,df2]):\n", 284 | " if len(final_list[i])==0:\n", 285 | " final_list[i] = df\n", 286 | " else:\n", 287 | " final_list[i] = pd.merge(final_list[i],df,how='left',on='Sequence')\n", 288 | "# print(i,\"Occurrences\" in df.columns)\n", 289 | "final_list[0]" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "def receiver(code_df_tup):\n", 299 | " return (code_df_tup)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "left = df3.iloc[0:5,[0,1,2]].drop(0)\n", 309 | "right = df3.iloc[0:5,[1,3]]\n", 310 | "pd.merge(left,right,how='left',on='Sequence')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "list1 = [1,2]\n", 320 | "list2 = [1,2,3,4]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "temp = df3.iloc[:,[1,5]]" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "df3.shape" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "seq_target = df2.Sequence.values" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "df3 = df3.query(\"Sequence.isin(@seq_target)\")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "df3.shape" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "import itertools" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "list_occ = [(0,1),(1,1),(2,1)]\n", 384 | "list_meta = [(0,4),(1,4),(2,4),(0,3),(1,3),(2,3)]" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | 
"execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "list(zip(list_occ,list_meta))" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "list((code,occ,meta) for (code,occ),(code1,meta) in itertools.product(list_occ,list_meta) if code==code1)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "list(itertools.product(list_occ,list_meta))" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "\"Occurrences\" in df3.columns[[1,2,3]]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "[1] * 2" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "any([False])" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "test_out_file2 = os.path.join(source_dir,\"output/merge_test_sliced_13days.csv.gz\")\n", 448 | "df3 = pd.read_csv(test_out_file2,compression=\"gzip\")" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "df3.head()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "from ast import literal_eval" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "cols_to_eval = ['Languages','Locations','DeviceCategories','TrafficSources']\n", 476 | "for col in cols_to_eval:\n", 477 | " print(col)\n", 478 | " df3[col] = df3[col].map(literal_eval)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "len(df3.sort_values(\"Occurrences\",ascending=False).iloc[0].Locations)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "df3.head()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "df3.drop(cols_to_eval,axis=1)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "test_oct = \"../../data/output\"" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "os.listdir(test_oct)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "df4 = pd.read_csv(os.path.join(test_oct,\"merged_oct_15_17.csv.gz\"),compression=\"gzip\")" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "df4[df4.PageSeq_Length==1].sort_values(\"Occurrences\",ascending=False).head().Sequence.values" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | 
"execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "df4[df4.Sequence.str.contains(\"%26&licenceid=\")].Event_List.iloc[0]" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "### object size stuff" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "import sys" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "df3.shape" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "sys.getsizeof(temp)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "sys.getsizeof(df3)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "2,147,483,647" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "105,849,367" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "475,231,861" 623 | ] 624 | } 625 | ], 626 | "metadata": { 627 | "kernelspec": { 628 | "display_name": "Python 3", 629 | "language": "python", 630 | "name": "python3" 631 | }, 632 | "language_info": { 633 | "codemirror_mode": { 634 | "name": "ipython", 635 | "version": 3 636 | }, 637 | "file_extension": ".py", 638 | "mimetype": "text/x-python", 639 | "name": "python", 640 | "nbconvert_exporter": "python", 641 | "pygments_lexer": "ipython3", 642 | "version": "3.6.0" 643 | } 644 | }, 645 | "nbformat": 4, 646 | "nbformat_minor": 2 647 | } 648 | -------------------------------------------------------------------------------- /src/data/merge_dataset.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import logging.config 5 | import os 6 | import sys 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pandas import DataFrame 12 | 13 | src = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 14 | sys.path.append(os.path.join(src, "data")) 15 | sys.path.append(os.path.join(src, "features")) 16 | import preprocess as prep 17 | import build_features as feat 18 | 19 | COUNTABLE_AGGREGATE_COLUMNS = ['Languages', 'Locations', 'DeviceCategories', 'TrafficSources', 20 | 'TrafficMediums', 'NetworkLocations', 'Dates'] 21 | # Execute module for only one file 22 | SINGLE: bool = False 23 | # Fewer files to process than available cpus. 24 | FEWER_THAN_CPU: bool = False 25 | # Drop journeys occurring once (not in a day, multiple days, governed by DEPTH globals). If false, overrides depth 26 | # globals and keeps journeys, resulting in massive dataframes (danger zone). 
27 | DROP_ONE_OFFS: bool = False 28 | # Drop journeys of length 1 29 | DROP_ONES: bool = False 30 | # Keep only journeys of length 1 31 | KEEP_ONES: bool = False 32 | 33 | 34 | def list_to_dict(metadata_list): 35 | """ 36 | Transform a metadata list into a dictionary (Counter) aggregate 37 | :param metadata_list: list of metadata values 38 | :return: Counter of value frequencies 39 | """ 40 | return Counter([xs for xs in metadata_list]) 41 | 42 | 43 | def str_to_dict(metadata_str): 44 | """ 45 | Transform a comma-separated metadata string eg "mobile,desktop,mobile" into a dict-like Counter of 46 | frequencies eg {mobile: 2, desktop: 1}. 47 | :param metadata_str: comma-separated metadata string 48 | :return: dict-like Counter of frequencies 49 | """ 50 | return list_to_dict(metadata_str.split(',')) 51 | 52 | 53 | def sequence_preprocess(user_journey_df): 54 | """ 55 | Bulk-execute main input pre-processing functions: from BigQuery journey strings to Page_Event_List to Page_List. 56 | PageSequence required for dataframes groupbys/filtering. 57 | :param user_journey_df: dataframe 58 | :return: no return, columns added in place. 59 | """ 60 | logger.info("BQ Sequence string to Page_Event_List...") 61 | user_journey_df['Page_Event_List'] = user_journey_df['Sequence'].map(prep.bq_journey_to_pe_list) 62 | logger.info("Page_Event_List to Page_List...") 63 | user_journey_df['Page_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_pe_components(x, 0)) 64 | logger.info("Page_List to PageSequence...") 65 | # TODO: Remove condition + internal PageSequence post-testing/debugging. 66 | if 'PageSequence' not in user_journey_df.columns: 67 | user_journey_df['PageSequence'] = user_journey_df['Page_List'].map(lambda x: ">>".join(x)) 68 | else: 69 | user_journey_df['PageSequence_internal'] = user_journey_df['Page_List'].map(lambda x: ">>".join(x)) 70 | 71 | 72 | def event_preprocess(user_journey_df): 73 | """ 74 | Bulk-execute event-related functions. Run after sequence_preprocess(user_journey_df) so that 75 | the Page_Event_List column exists. 76 | :param user_journey_df: dataframe 77 | :return: no return, columns added in place. 78 | """ 79 | logger.info("Preprocess and aggregate events...") 80 | logger.debug("Page_Event_List to Event_List...") 81 | user_journey_df['Event_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_pe_components(x, 1)) 82 | logger.debug("Computing event-related counts and frequencies...") 83 | event_counters(user_journey_df) 84 | 85 | 86 | def taxon_preprocess(user_journey_df): 87 | """ 88 | Bulk map functions for taxon extraction: taxon lists and page-taxon pairs. 89 | :param user_journey_df: dataframe 90 | :return: no return, columns added in place. 91 | """ 92 | logger.info("Preprocess taxons...") 93 | logger.debug("Page_Event_List to Taxon_List...") 94 | user_journey_df['Taxon_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_cd_components(x, 2)) 95 | logger.debug("Page_Event_List to Taxon_Page_List...") 96 | user_journey_df['Taxon_Page_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_pcd_list(x, 2)) 97 | 98 | 99 | def event_counters(user_journey_df): 100 | """ 101 | Bulk map functions for event frequency/counts. 102 | :param user_journey_df: dataframe 103 | :return: no return, columns added in place.
104 | """ 105 | # logger.debug("Computing number of event categories...") 106 | # user_journey_df['num_event_cats'] = user_journey_df['Event_List'].map(feat.count_event_cat) 107 | logger.debug("Computing frequency of event categories...") 108 | user_journey_df['Event_cats_agg'] = user_journey_df['Event_List'].map(feat.aggregate_event_cat) 109 | logger.debug("Computing frequency of event categories and actions...") 110 | user_journey_df['Event_cat_act_agg'] = user_journey_df['Event_List'].map(feat.aggregate_event_cat_act) 111 | 112 | 113 | def add_loop_columns(user_journey_df): 114 | """ 115 | Add de-looped journey columns: Page_List_NL, Page_Seq_NL and Occurrences_NL. 116 | :param user_journey_df: dataframe 117 | :return: no return, columns added in place. 118 | """ 119 | logger.info("Preprocess journey looping...") 120 | logger.debug("Collapsing loops...") 121 | user_journey_df['Page_List_NL'] = user_journey_df['Page_List'].map(prep.collapse_loop) 122 | # In order to groupby during analysis step 123 | logger.debug("De-looped lists to string...") 124 | user_journey_df['Page_Seq_NL'] = user_journey_df['Page_List_NL'].map(lambda x: ">>".join(x)) 125 | 126 | if 'Page_Seq_Occurrences' not in user_journey_df.columns: 127 | logger.debug("Setting up Page_Seq_Occurrences...") 128 | user_journey_df['Page_Seq_Occurrences'] = user_journey_df.groupby('PageSequence')['Occurrences'].transform( 129 | 'sum') 130 | 131 | # Count occurrences of de-looped journeys, most generic journey frequency metric. 132 | logger.debug("Aggregating de-looped journey occurrences...") 133 | user_journey_df['Occurrences_NL'] = user_journey_df.groupby('Page_Seq_NL')['Occurrences'].transform('sum') 134 | logger.debug("De-looped page sequence to list...") 135 | user_journey_df['Page_List_NL'] = user_journey_df['Page_Seq_NL'].map( 136 | lambda x: x.split(">>") if isinstance(x, str) else np.NaN) 137 | 138 | 139 | def agg_dict(agg_from_dict, row_dict): 140 | for xs, value in row_dict.items(): 141 | if xs in agg_from_dict.keys(): 142 | agg_from_dict[xs] += value 143 | else: 144 | agg_from_dict[xs] = value 145 | return agg_from_dict 146 | 147 | 148 | def aggregate_metadata(dataframe): 149 | metadata_counter = {} 150 | for agg in dataframe.columns: 151 | if agg in COUNTABLE_AGGREGATE_COLUMNS: 152 | logging.info("Setting up aggregate dictionary {}".format(agg)) 153 | metadata_counter[agg] = {} 154 | 155 | logging.info("Starting iteration...") 156 | for agg in metadata_counter.keys(): 157 | logging.info("Aggregating: {}".format(agg)) 158 | for row in zip(dataframe['Sequence'], dataframe[agg]): 159 | if row[0] in metadata_counter[agg].keys(): 160 | metadata_counter[agg][row[0]] = agg_dict(metadata_counter[agg][row[0]], 161 | str_to_dict(row[1])) 162 | else: 163 | metadata_counter[agg][row[0]] = str_to_dict(row[1]) 164 | 165 | return metadata_counter 166 | 167 | 168 | def preprocess_dataframe(dataframe): 169 | """ 170 | Aggregate metadata and occurrence counts per unique Sequence. Metadata from duplicated sequences 171 | (multiple merged files) is combined into per-sequence frequency lists. 172 | :param dataframe: merged dataframe built from raw BigQuery extract(s) 173 | :return: preprocessed dataframe 174 | """ 175 | logging.info("Dataframe shape: {}".format(dataframe.shape)) 176 | 177 | multiple = any(dataframe.Sequence.duplicated()) 178 | 179 | if multiple: 180 | logging.info("Working on multiple merged dataframes") 181 | metadata_counter = aggregate_metadata(dataframe) 182 | else: 183 | logging.info("Working on a single dataframe") 184 | for agg in dataframe.columns: 185 | 186 | if agg in COUNTABLE_AGGREGATE_COLUMNS: 187 | logging.info("Agg {}".format(agg)) 188 | dataframe[agg] = dataframe[agg].map(lambda x: list(str_to_dict(x).items())) 189 | 190 | logging.info("Computing sequence
occurrences...") 191 | dataframe['Occurrences'] = dataframe.groupby('Sequence')['Occurrences'].transform('sum') 192 | 193 | if multiple: 194 | bef = dataframe.shape[0] 195 | logger.debug("Current # of rows: {}. Dropping duplicate rows...".format(bef)) 196 | dataframe.drop_duplicates(subset='Sequence', keep='first', inplace=True) 197 | after = dataframe.shape[0] 198 | logger.debug("Dropped {} duplicated rows.".format(bef - after)) 199 | 200 | for agg in metadata_counter.keys(): 201 | logger.info("Mapping {}, items: {}...".format(agg, len(metadata_counter[agg]))) 202 | dataframe[agg] = dataframe['Sequence'].map(lambda x: list(metadata_counter[agg][x].items())) 203 | 204 | if DROP_ONE_OFFS: 205 | dataframe['Page_Seq_Occurrences'] = dataframe.groupby('PageSequence')['Occurrences'].transform('sum') 206 | bef = dataframe.shape[0] 207 | dataframe = dataframe[dataframe.Page_Seq_Occurrences > 1] 208 | after = dataframe.shape[0] 209 | logger.debug("Dropped {} one-off rows.".format(bef - after)) 210 | # Return the dataframe so that the one-off filtering above is not lost by the caller. 211 | return dataframe 212 | def initialize_make(files: list, destination: str, merged_filename: str): 213 | """ 214 | Read, concatenate, preprocess and save the merged dataframe. 215 | :param files: list of raw BigQuery extract files to merge 216 | :param destination: destination directory for the merged dataframe file 217 | :param merged_filename: filename for the resulting merged dataframe file 218 | :return: no return, writes the merged dataframe to file. 219 | """ 220 | 221 | logging.info("Reading {} files...".format(len(files))) 222 | 223 | df = pd.concat([read_file(file) for file in files], ignore_index=True) 224 | 225 | df = preprocess_dataframe(df) 226 | 227 | logging.debug(df.iloc[0]) 228 | 229 | path_to_file = os.path.join(destination, "merged_" + merged_filename) 230 | 231 | 232 | logging.debug("Saving merged dataframe...") 233 | logger.info("Saving at: {}".format(path_to_file)) 234 | df.to_csv(path_to_file, sep="\t", compression='gzip', index=False) 235 | 236 | 237 | def read_file(filename): 238 | """ 239 | Initialize dataframe using specified filename, do some initial prep if necessary depending on global vars 240 | (specified via arguments) 241 | :param filename: filename to read, no exists_check because files are loaded from a specified directory 242 | :return: loaded (maybe modified) pandas dataframe 243 | """ 244 | logging.info("Reading: {}".format(filename)) 245 | df: DataFrame = pd.read_csv(filename, compression="gzip") 246 | # logging.info("pre {}".format(df.shape)) 247 | df.dropna(subset=['Sequence'], inplace=True) 248 | # logging.info("post {}".format(df.shape)) 249 | # print(df.shape) 250 | 251 | # Drop journeys of length 1 252 | if DROP_ONES: 253 | logging.debug("Dropping ones...") 254 | df.query("PageSeq_Length > 1", inplace=True) 255 | 256 | # Keep ONLY journeys of length 1 257 | elif KEEP_ONES: 258 | logging.debug("Keeping only ones...") 259 | df.query("PageSeq_Length == 1", inplace=True) 260 | # If one-off journeys will be dropped later, PageSequence is needed to group their occurrences 261 | if DROP_ONE_OFFS: 262 | if "PageSequence" not in df.columns: 263 | sequence_preprocess(df) 264 | # df.drop(DROPABLE_COLS, axis=1, inplace=True) 265 | return df 266 | 267 | 268 | def generate_file_list(source_dir, stub): 269 | """ 270 | Initialize list of files to read from a specified directory. If stub is not empty, filter files to be read 271 | based on whether their filename includes the stub.
272 | :param source_dir: Source directory 273 | :param stub: Filename stub for file filtering 274 | :return: a list of files 275 | """ 276 | file_list = sorted([os.path.join(source_dir, file) for file in os.listdir(source_dir)]) 277 | if stub is not None: 278 | return [file for file in file_list if stub in file] 279 | else: 280 | return file_list 281 | 282 | 283 | def build_filename(file_list): 284 | """ 285 | 286 | :param file_list: 287 | :return: 288 | """ 289 | file_name = "_".join(file_list[0].split("/")[-1].split("_")[0:-1]) 290 | date_list = ["".join(file.split("_")[-1].replace(".csv.gz", "").split("-")) for file in 291 | [file_list[0], file_list[-1]]] 292 | if DROP_ONES: 293 | file_name += ("_dlo") 294 | if KEEP_ONES: 295 | file_name += ("_klo") 296 | if DROP_ONE_OFFS: 297 | file_name += ("_doo") 298 | 299 | return file_name + "_" + "_".join(date_list) 300 | 301 | 302 | if __name__ == "__main__": 303 | parser = argparse.ArgumentParser(description='Module that produces a merged, metadata-aggregated and ' 304 | 'preprocessed dataset (.csv.gz), given a source directory ' 305 | 'containing raw BigQuery extract dataset(s). Merging is ' 306 | 'skipped if only one file is provided.') 307 | parser.add_argument('output_filename', default="", nargs="?", 308 | help='Naming convention for resulting merged dataframe file.') 309 | parser.add_argument('source_directory', default="", nargs="?", help='Source directory for input dataframe file(s).') 310 | parser.add_argument('dest_directory', default="", nargs="?", 311 | help='Specialized destination directory for output dataframe file.') 312 | parser.add_argument('-doo', '--drop_one_offs', action='store_true', 313 | help='Drop journeys occurring only once (on a daily basis, ' 314 | 'or over approximately 3 day periods).') 315 | parser.add_argument('-kloo', '--keep_len_one_only', action='store_true', 316 | help='Keep ONLY journeys with length 1 ie journeys visiting only one page.') 317 | parser.add_argument('-dlo', '--drop_len_one', action='store_true', 318 | help='Drop journeys with length 1 ie journeys visiting only one page.') 319 | parser.add_argument('-f', '--filename_stub', default=None, type=str, 320 | help='Filter files to be loaded based on whether their filenames contain specified stub.') 321 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 322 | args = parser.parse_args() 323 | 324 | DATA_DIR = os.getenv("DATA_DIR") 325 | source_directory = os.path.join(DATA_DIR, 326 | args.source_directory if args.source_directory != "" else "raw_bq_extract") 327 | dest_directory = os.path.join(DATA_DIR, args.dest_directory if args.dest_directory != "" else "processed_journey") 328 | # final_filename = args.output_filename 329 | filename_stub = args.filename_stub 330 | 331 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 332 | logging.config.fileConfig(LOGGING_CONFIG) 333 | logger = logging.getLogger('merge_dataset') 334 | 335 | if args.quiet: 336 | logging.disable(logging.DEBUG) 337 | 338 | if os.path.isdir(source_directory): 339 | # Set up variable values from parsed arguments 340 | DROP_ONE_OFFS = args.drop_one_offs 341 | DROP_ONES = args.drop_len_one 342 | KEEP_ONES = args.keep_len_one_only 343 | logger.info( 344 | "Data exclusion parameters:\nDrop one-off journeys: {}" 345 | "\nDrop journeys of length 1: {}" 346 | "\nKeep journeys only of length 1: {}".format(DROP_ONE_OFFS, DROP_ONES, KEEP_ONES)) 347 | 348 | logger.info("Loading data...") 349 | 350 | to_load = 
generate_file_list(source_directory, filename_stub) 351 | 352 | if len(to_load) > 0: 353 | 354 | if not os.path.isdir(dest_directory): 355 | logging.info( 356 | "Specified destination directory \"{}\" does not exist, creating...".format(dest_directory)) 357 | os.mkdir(dest_directory) 358 | 359 | final_filename = build_filename(to_load) 360 | logger.debug("Produced output filename: {}".format(final_filename)) 361 | initialize_make(to_load, dest_directory, final_filename + ".csv.gz") 362 | else: 363 | logging.info( 364 | "Specified source directory \"{}\" contains no target files.".format(source_directory)) 365 | 366 | else: 367 | logging.info("Specified source directory \"{}\" does not exist, cannot read files.".format(source_directory)) 368 | -------------------------------------------------------------------------------- /notebooks/taxon/taxon_eda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "import os\n", 11 | "src_data = os.path.join(os.path.dirname(os.getenv(\"DATA_DIR\")),\"src/data\")\n", 12 | "sys.path.append(src_data)\n", 13 | "import preprocess as prep\n", 14 | "import datetime\n", 15 | "import colorsys\n", 16 | "import pandas as pd\n", 17 | "import re\n", 18 | "import numpy as np\n", 19 | "from ast import literal_eval\n", 20 | "from collections import Counter\n", 21 | "import pprint\n", 22 | "import networkx as nx\n", 23 | "import pygraphviz\n", 24 | "from networkx.drawing.nx_agraph import graphviz_layout\n", 25 | "%matplotlib inline\n", 26 | "import matplotlib.pyplot as plt" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "DATA_DIR = os.getenv(\"DATA_DIR\")\n", 36 | "filename = \"preprocessed_with_dupes_31_10_taxon2.csv.gz\"\n", 37 | "path = os.path.join(DATA_DIR,\"output\", filename)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "df = pd.read_csv(path,sep=\"\\t\",compression=\"gzip\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df.columns" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "any(df.Sequence.duplicated())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "for col in df.columns:\n", 83 | "# if \"Sequence\" not in col and not col.startswith(\"Event\"):\n", 84 | "# if isinstance(df[col].iloc[0],str) and \"[\" in df[col].iloc[0]:\n", 85 | "# print(col)\n", 86 | "# df[col] = df[col].map(literal_eval)\n", 87 | " if re.search(\"^Taxon|^Page\",col):\n", 88 | " if isinstance(df[col].iloc[0],str) and \"[\" in df[col].iloc[0]:\n", 89 | " print(col)\n", 90 | " df[col] = df[col].map(literal_eval)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Count taxons within journeys\n", 98 | "### Setup" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 
106 | "source": [ 107 | "def unique_taxon_flat_unique(taxon_list):\n", 108 | " return sum(Counter(set([t for taxon in taxon_list for t in taxon.split(\",\")])).values())\n", 109 | "def unique_taxon_nested_unique(taxon_list):\n", 110 | " return sum(Counter(set([taxon for taxon in taxon_list])).values())\n", 111 | "def unique_taxon_flat_pages(taxon_list):\n", 112 | " return sum(Counter([t for taxon in taxon_list for t in taxon.split(\",\")]).values())\n", 113 | "def unique_taxon_nested_pages(taxon_list):\n", 114 | " return sum(Counter([taxon for taxon in taxon_list]).values())" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df.iloc[0].Sequence" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "target = df.Taxon_List.iloc[1]\n", 133 | "print(target)\n", 134 | "print(unique_taxon_flat_unique(target))\n", 135 | "print(unique_taxon_nested_unique(target))\n", 136 | "print(unique_taxon_flat_pages(target))\n", 137 | "print(unique_taxon_nested_pages(target))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "df['taxon_flat_unique'] = df['Taxon_List'].map(unique_taxon_flat_unique)\n", 147 | "df['taxon_nested_unique'] = df['Taxon_List'].map(unique_taxon_nested_unique)\n", 148 | "df['taxon_flat_pages'] = df['Taxon_List'].map(unique_taxon_flat_pages)\n", 149 | "df['taxon_nested_pages'] = df['Taxon_List'].map(unique_taxon_nested_pages)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "df.describe().drop(\"count\").applymap(lambda x: format(x,\"f\"))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "df.describe().drop(\"count\").applymap(lambda x: '%.2f' % x)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "df[df.taxon_flat_unique == 429].Taxon_List.values" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "df[df.taxon_flat_unique == 0].Sequence.values" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "def taxon_split(taxon_list):\n", 195 | " return [t for taxon in taxon_list for t in taxon.split(\",\")]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "#### Build list of unique taxons, excluding \"other\"\n", 205 | "taxon_counter = Counter()\n", 206 | "for tup in df.itertuples():\n", 207 | " taxons = taxon_split(tup.Taxon_List)\n", 208 | " for taxon in taxons:\n", 209 | " taxon_counter[taxon]+=1\n", 210 | "len(taxon_counter) " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "list(taxon_counter.keys())[0:10]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "taxon_counter.most_common(10)" 229 | ] 230 | }, 
231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "taxon_df = pd.read_csv(\"taxon_level_df.tsv\",sep='\\t')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Assign unique parent taxons per journey" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df['subpaths'] = df['Page_List'].map(prep.subpaths_from_list)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "for val in df[['Page_List','subpaths']].iloc[0].values:\n", 263 | " pprint.pprint(val)\n", 264 | " print(\"\\n====\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### create new subpaths where each element is a (page,parent taxon pair, pick one?)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "def get_taxon_name(taxon_id):\n", 281 | " if taxon_id in taxon_df.content_id.values:\n", 282 | " return taxon_df[taxon_df.content_id==taxon_id].iloc[0].title\n", 283 | " else:\n", 284 | " return None" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "def taxon_title(taxon_id_list):\n", 294 | " return [get_taxon_name(taxon_id) for taxon_id in taxon_id_list]" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "def subpaths_from_pcd_list(pcd_list):\n", 304 | " return [[(page,taxon_title(taxons)), (pcd_list[i + 1][0],taxon_title(pcd_list[i + 1][1]))] \n", 305 | " for i, (page,taxons) in enumerate(pcd_list) if i < len(pcd_list) - 1]" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "test_journey = df[df.PageSeq_Length>4].iloc[0]" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "pprint.pprint([p for p,_ in test_journey.Taxon_Page_List])" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "for i,element in enumerate(subpaths_from_pcd_list(test_journey.Taxon_Page_List)):\n", 333 | " print(i,element,\"\\n====\")" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "df['taxon_subpaths'] = df['Taxon_Page_List'].map(subpaths_from_pcd_list)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# taxon_title(df.Taxon_Page_List.iloc[0][0][1])\n", 352 | "\n", 353 | "# def add_to_taxon_dict(diction,taxon_list):\n", 354 | "# for taxon in taxon_list:\n", 355 | "# if taxon not in diction.keys():\n", 356 | "# diction[taxon] = get_taxon_name(taxon)\n", 357 | "\n", 358 | "# df.Taxon_Page_List.iloc[0][0][1]\n", 359 | "\n", 360 | "# df.Taxon_Page_List.iloc[0][1][1]\n", 361 | "\n", 362 | "# taxon_name = {}\n", 363 | "# 
add_to_taxon_dict(taxon_name,df.Taxon_Page_List.iloc[0][0][1]+df.Taxon_Page_List.iloc[0][1][1])\n", 364 | "\n", 365 | "# taxon_name\n", 366 | "\n", 367 | "# df.shape\n", 368 | "\n", 369 | "# print(datetime.datetime.now().strftime(\"[%H:%M:%S]\"))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Graph viz" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "## graph some stuff based on taxon (parent?)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "def add_page_taxon(diction,key,value):\n", 393 | " if key not in diction.keys():\n", 394 | " diction[key] = value" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "adjacency_list = {}\n", 404 | "adjacency_counter = Counter()\n", 405 | "freq_filter = 1000\n", 406 | "dupe_count = 0\n", 407 | "page_taxon_title = {}\n", 408 | "\n", 409 | "for i,tup in enumerate(df.sort_values(by=\"Occurrences\",ascending=False).itertuples()):\n", 410 | "# for page,taxon in tup.Taxon_Page_List:\n", 411 | " for subpath in subpaths_from_pcd_list(tup.Taxon_Page_List):\n", 412 | " start = subpath[0][0]\n", 413 | " end = subpath[1][0]\n", 414 | "# print(subpath[0][1]+subpath[1][1])\n", 415 | " adjacency_counter [(start,end)] += tup.Occurrences\n", 416 | " \n", 417 | " \n", 418 | " if start!=end and adjacency_counter[(start,end)] >= freq_filter:\n", 419 | " \n", 420 | " add_page_taxon(page_taxon_title,start,subpath[0][1])\n", 421 | " add_page_taxon(page_taxon_title,end,subpath[1][1])\n", 422 | " \n", 423 | "\n", 424 | " if start in adjacency_list.keys():\n", 425 | " if end not in adjacency_list[start]:\n", 426 | " adjacency_list[start].append(end)\n", 427 | " else:\n", 428 | " adjacency_list[start] = [end]\n", 429 | " \n", 430 | " if len(adjacency_list)>1000:\n", 431 | " break\n", 432 | " \n", 433 | " if i%30000==0:\n", 434 | " print(datetime.datetime.now().strftime(\"[%H:%M:%S]\"),\"ind\",i)\n", 435 | " print(len(adjacency_list))" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "len(adjacency_list)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "list(adjacency_list.items())[0:10]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "list(page_taxon_title.items())[0:10]" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "for page,taxons in page_taxon_title.items():\n", 472 | " page_taxon_title[page] = \"_\".join([taxon if taxon is not None else \"None\" for taxon in taxons]) " 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "### Set up colors" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "N = len(page_taxon_title.values())\n", 489 | "HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]\n", 490 | "RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)\n", 491 | "RGB_tuples = list(RGB_tuples)" 492 | ] 493 
| }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "taxon_color = {taxon:RGB_tuples[i] for i,taxon in enumerate(page_taxon_title.values())}" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "digraph = nx.DiGraph()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "for node,out_nodes in adjacency_list.items():\n", 519 | " color = taxon_color[page_taxon_title[node]]\n", 520 | " digraph.add_node(node,taxon=page_taxon_title[node],color=color)\n", 521 | " for o_node in out_nodes:\n", 522 | " color = taxon_color[page_taxon_title[o_node]]\n", 523 | " digraph.add_node(o_node,taxon=page_taxon_title[o_node],color=color)\n", 524 | " digraph.add_edge(node,o_node)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "digraph.edges()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "edges = digraph.edges()\n", 543 | "color_map = [data['color'] for _,data in digraph.nodes(data=True)]\n", 544 | "pos = nx.nx_agraph.graphviz_layout(digraph, prog='neato')\n", 545 | "nx.draw(digraph, pos, node_size=20, fontsize=12, edges=edges, node_color=color_map)\n", 546 | "plt.show()" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [] 555 | } 556 | ], 557 | "metadata": { 558 | "kernelspec": { 559 | "display_name": "Python 3", 560 | "language": "python", 561 | "name": "python3" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.6.0" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 2 578 | } 579 | -------------------------------------------------------------------------------- /notebooks/eda/look_at_sampling_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 38, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-02-04T16:02:59.714543Z", 9 | "start_time": "2019-02-04T16:02:59.709887Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import os \n", 15 | "import pandas as pd\n", 16 | "import numpy as np\n", 17 | "import ast\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import math\n", 20 | "\n", 21 | "from collections import Counter" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 39, 27 | "metadata": { 28 | "ExecuteTime": { 29 | "end_time": "2019-02-04T16:02:59.969283Z", 30 | "start_time": "2019-02-04T16:02:59.956071Z" 31 | } 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "%matplotlib inline" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 40, 41 | "metadata": { 42 | "ExecuteTime": { 43 | "end_time": "2019-02-04T16:03:00.309544Z", 44 | "start_time": "2019-02-04T16:03:00.301949Z" 45 | } 46 | }, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "500\n" 53 | ] 54 | 
} 55 | ], 56 | "source": [ 57 | "# Some of the columns we will look at can be quite wide, but it's good to get an idea of what they contain\n", 58 | "print(pd.get_option('max_colwidth'))\n", 59 | "pd.set_option('max_colwidth',500)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## File/dir locations\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "ExecuteTime": { 74 | "end_time": "2019-02-04T14:49:55.813774Z", 75 | "start_time": "2019-02-04T14:49:55.809824Z" 76 | } 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "DATA_DIR = os.getenv(\"DATA_DIR\")\n", 81 | "filename = \"preprocessed_taxon_pageseq_20190114_20190116.csv.gz\"\n", 82 | "# df_file = os.path.join(DATA_DIR, \"processed_journey\", filename)\n", 83 | "# df_reduced_file = os.path.join(DATA_DIR, \"processed_journey\", \"reduced_\"+filename)\n", 84 | "# df_rel_file = os.path.join(DATA_DIR, \"processed_journey\", \"rel_\"+filename)\n", 85 | "# df_doo_file = os.path.join(\n", 86 | "# DATA_DIR, \"processed_journey\",\n", 87 | "# \"doo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz\")\n", 88 | "\n", 89 | "df_dlo_file = os.path.join(\n", 90 | " DATA_DIR, \"processed_journey\",\n", 91 | " \"dlo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz\")\n", 92 | "df_kloo_file = os.path.join(\n", 93 | " DATA_DIR, \"processed_journey\",\n", 94 | " \"kloo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz\")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": { 101 | "ExecuteTime": { 102 | "end_time": "2019-02-04T14:58:13.251383Z", 103 | "start_time": "2019-02-04T14:49:56.256851Z" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "#the 'drop length one' data read into pandas dataframe\n", 109 | "dlo = pd.read_csv(df_dlo_file, compression='gzip')\n", 110 | "#the 'keep length one only' data read into pandas dataframe\n", 111 | "kloo = pd.read_csv(df_kloo_file, compression='gzip')" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2019-02-04T14:58:13.428292Z", 120 | "start_time": "2019-02-04T14:58:13.339211Z" 121 | } 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "(3788851, 15)" 128 | ] 129 | }, 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "dlo.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": { 143 | "ExecuteTime": { 144 | "end_time": "2019-02-04T14:58:13.446260Z", 145 | "start_time": "2019-02-04T14:58:13.439735Z" 146 | } 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "(890977, 15)" 153 | ] 154 | }, 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "kloo.shape" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Load up a data from two files: dlo = drop length one journeys, kloo = keep length one journeys only \n", 169 | "\n", 170 | "This data was produced by an early version of the pipeline and is missing some descriptive variables, such as taxons etc. However, it contains the sequences of pages and behaviours (or events) of users on those pages, including interaction with the sidebar and the related links contained therein." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 18, 176 | "metadata": { 177 | "ExecuteTime": { 178 | "end_time": "2019-02-04T15:23:15.685611Z", 179 | "start_time": "2019-02-04T15:23:15.477848Z" 180 | } 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "6537680\n", 188 | "7650687\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "print(dlo['Occurrences'].sum())\n", 194 | "print(kloo['Occurrences'].sum())" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 62, 200 | "metadata": { 201 | "ExecuteTime": { 202 | "end_time": "2019-02-04T16:53:09.376433Z", 203 | "start_time": "2019-02-04T16:53:07.241082Z" 204 | }, 205 | "code_folding": [] 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "#get a reproducible sample of 20% of journey types from each dataframe, \n", 210 | "#sampled in proportion to the number of occurrences of each journey type\n", 211 | "#then join the new samples together into a single dataframe\n", 212 | "\n", 213 | "# df = pd.concat([dlo.sample(frac=0.2, random_state=1234, weights=dlo.Occurrences).copy(), kloo.sample(frac=0.2, random_state=1234, weights=kloo.Occurrences).copy()], ignore_index=True)\n", 214 | "\n", 215 | "\n", 216 | "# try sampling with replacement, using occurrences as weights, but then \n", 217 | "# change all \"occurrences\" to 1, to try to create a more representative sample?\n", 218 | "# df = pd.concat([\n", 219 | "# dlo.sample(\n", 220 | "# frac=0.4, random_state=1234, weights=dlo.Occurrences, replace=True\n", 221 | "# ).copy(),\n", 222 | "# kloo.sample(\n", 223 | "# frac=0.4, random_state=1234, weights=kloo.Occurrences, replace=True\n", 224 | "# ).copy()],\n", 225 | "# ignore_index=True)\n", 226 | "\n", 227 | "# try concatting and THEN sampling with replacement, using occurrences as\n", 228 | "# weights, but then change all \"occurrences\" to 1, to try to create a more \n", 229 | "# representative sample?\n", 230 | "# df = pd.concat([\n", 231 | "# dlo.copy(),\n", 232 | "# kloo.copy()],\n", 233 | "# ignore_index=True)\n", 234 | "# df = df.sample(\n", 235 | "# frac=0.4, random_state=1234, weights=df.Occurrences, replace=True\n", 236 | "# )\n", 237 | "\n", 238 | "# # try concatting and THEN sampling without replacement, using occurrences as\n", 239 | "# # weights\n", 240 | "# df = pd.concat([\n", 241 | "# dlo.copy(),\n", 242 | "# kloo.copy()],\n", 243 | "# ignore_index=True)\n", 244 | "# df = df.sample(\n", 245 | "# frac=0.4, random_state=1234, weights=df.Occurrences\n", 246 | "# )\n", 247 | "\n", 248 | "# try sampling with, using occurrences as weights, \n", 249 | "# and sum(Occurrences)*0.4 as n, but then change all \"occurrences\" to 1, \n", 250 | "# to try to create a more representative sample?\n", 251 | "df = pd.concat([\n", 252 | " dlo.sample(\n", 253 | " n=math.ceil(0.4*dlo['Occurrences'].sum()), random_state=1234, \n", 254 | " weights=dlo.Occurrences, replace=True\n", 255 | " ).copy(),\n", 256 | " kloo.sample(\n", 257 | " n=math.ceil(0.4*kloo['Occurrences'].sum()), random_state=1234,\n", 258 | " weights=kloo.Occurrences, replace=True\n", 259 | " ).copy()],\n", 260 | " ignore_index=True)\n", 261 | "\n", 262 | "\n", 263 | "# try just concatting them\n", 264 | "df = pd.concat([\n", 265 | " dlo[\n", 266 | " ['DeviceCategories', 'Occurrences', 'Sequence', 'Event_cat_act_agg']\n", 267 | " ].copy(),\n", 268 | " kloo[\n", 269 | " ['DeviceCategories', 'Occurrences', 'Sequence', 'Event_cat_act_agg']\n", 270 | " ].copy()],\n", 271 | " 
ignore_index=True)\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 63, 277 | "metadata": { 278 | "ExecuteTime": { 279 | "end_time": "2019-02-04T16:53:09.386798Z", 280 | "start_time": "2019-02-04T16:53:09.380347Z" 281 | } 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "(4679828, 4)" 288 | ] 289 | }, 290 | "execution_count": 63, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "df.shape" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Remove tablet occurrences" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 64, 309 | "metadata": { 310 | "ExecuteTime": { 311 | "end_time": "2019-02-04T16:54:22.456736Z", 312 | "start_time": "2019-02-04T16:53:09.388872Z" 313 | } 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "def device_count(x, device):\n", 318 | " return sum([value for item, value in x if item == device])\n", 319 | "df[\"TabletCount\"] = df['DeviceCategories'].apply(\n", 320 | " ast.literal_eval).map(lambda x: device_count(x, \"tablet\"))\n", 321 | "df[\"Occurrences\"] = df[\"Occurrences\"] - df[\"TabletCount\"]" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 65, 327 | "metadata": { 328 | "ExecuteTime": { 329 | "end_time": "2019-02-04T16:54:23.745408Z", 330 | "start_time": "2019-02-04T16:54:22.459505Z" 331 | } 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "(4294728, 5)" 338 | ] 339 | }, 340 | "execution_count": 65, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "df = df[df[\"Occurrences\"] != 0]\n", 347 | "df.shape" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 61, 353 | "metadata": { 354 | "ExecuteTime": { 355 | "end_time": "2019-02-04T16:52:44.670535Z", 356 | "start_time": "2019-02-04T16:52:44.617370Z" 357 | } 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "# MAKE EACH OCCURRENCES 1\n", 362 | "# df['Occurrences'] = 1" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## journey_click_rate\n", 370 | "There is no difference in the proportion of journeys using at least one related link (journey_click_rate) between page variant A and page variant B.\n", 371 | "\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "\\begin{equation*}\n", 379 | "\\frac{\\text{total number of journeys including at least one click on a related link}}{\\text{total number of journeys}}\n", 380 | "\\end{equation*}" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "### total number of journeys including at least one click on a related link\n", 388 | "The numerator.\n", 389 | "\n", 390 | "We need to check within the Sequence column, whether the corresponding user journey has an Event where a related link was clicked. There is more than one level to this Event, we are specifically interested in \"Related content\" (as this is the sidebar of the page, the related links we are interested in)." 
391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 66, 396 | "metadata": { 397 | "ExecuteTime": { 398 | "end_time": "2019-02-04T16:54:48.924293Z", 399 | "start_time": "2019-02-04T16:54:48.917549Z" 400 | }, 401 | "code_folding": [] 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "#Compute whether a journey includes at least one related link click\n", 406 | "def is_related(x):\n", 407 | " return all(cond in x for cond in [\"relatedLinkClicked\",\"Related content\"])" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Please note, `is_related` does not make sure that `relatedLinkClicked` and `Related content` exist in the same event in `Sequence`" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 67, 420 | "metadata": { 421 | "ExecuteTime": { 422 | "end_time": "2019-02-04T16:55:03.066834Z", 423 | "start_time": "2019-02-04T16:54:49.296795Z" 424 | } 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "# map across the Sequence variable, which includes pages and Events\n", 429 | "# we want to pass all the list elements to a function one-by-one and then collect the output.\n", 430 | "df[\"Has_Related\"] = df[\"Sequence\"].map(is_related)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 68, 436 | "metadata": { 437 | "ExecuteTime": { 438 | "end_time": "2019-02-04T16:55:03.188149Z", 439 | "start_time": "2019-02-04T16:55:03.069423Z" 440 | } 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "395771" 447 | ] 448 | }, 449 | "execution_count": 68, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "# We can filter for True and sum\n", 456 | "df[df[\"Has_Related\"]].Occurrences.sum()" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### total number of journeys\n", 464 | "The denominator." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 69, 470 | "metadata": { 471 | "ExecuteTime": { 472 | "end_time": "2019-02-04T16:55:03.222965Z", 473 | "start_time": "2019-02-04T16:55:03.191101Z" 474 | } 475 | }, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "12971165" 481 | ] 482 | }, 483 | "execution_count": 69, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "df.Occurrences.sum()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "### final metric" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "Given this sample, we see:" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 70, 509 | "metadata": { 510 | "ExecuteTime": { 511 | "end_time": "2019-02-04T16:55:03.340889Z", 512 | "start_time": "2019-02-04T16:55:03.233213Z" 513 | } 514 | }, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "0.030511600153108838" 520 | ] 521 | }, 522 | "execution_count": 70, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "df[df[\"Has_Related\"]].Occurrences.sum() / df.Occurrences.sum()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "## ratio of clicks on navigation elements vs. 
clicks on related links\n", 536 | "\n", 537 | "There is no statistically significant difference in the ratio of clicks on navigation elements vs. clicks on related links between page variant A and page variant B" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "\\begin{equation*}\n", 545 | "\\frac{\\text{total number of navigation element click events from content pages}}{\\text{total number of related link click events}}\n", 546 | "\\end{equation*}" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "### total number of related link click events" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "we need to check `Related content` is in the event, because the `relatedLinkClicked` is also used for \"explore the topic\" links at the bottom of the page, with the event action containing `Explore the topic`, e.g. `(('relatedLinkClicked', '2.1 Explore the topic'), 1)`" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 51, 566 | "metadata": { 567 | "ExecuteTime": { 568 | "end_time": "2019-02-04T16:30:35.229409Z", 569 | "start_time": "2019-02-04T16:30:35.204678Z" 570 | } 571 | }, 572 | "outputs": [], 573 | "source": [ 574 | "# If the event category is 'relatedLinkClicked' and the event action contains 'Related content', \n", 575 | "# return the count of that event\n", 576 | "def get_number_of_events_rl(event):\n", 577 | " if event[0][0] == 'relatedLinkClicked' and 'Related content' in event[0][1]:\n", 578 | " return event[1]\n", 579 | " return 0\n", 580 | "\n", 581 | "def sum_related_click_events(event_list):\n", 582 | " return sum([get_number_of_events_rl(event) for event in event_list])" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 52, 588 | "metadata": { 589 | "ExecuteTime": { 590 | "end_time": "2019-02-04T16:34:03.277220Z", 591 | "start_time": "2019-02-04T16:30:35.762510Z" 592 | } 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "# get the number of related links clicks per Sequence\n", 597 | "df['Related Links Clicks per seq'] = df['Event_cat_act_agg'].apply(\n", 598 | " ast.literal_eval).map(sum_related_click_events)\n", 599 | "\n", 600 | "# get the total number of related links clicks for that row (clicks per sequence multiplied by occurrences)\n", 601 | "df['Related Links Clicks row total'] = df['Related Links Clicks per seq'] * df['Occurrences']" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 53, 607 | "metadata": { 608 | "ExecuteTime": { 609 | "end_time": "2019-02-04T16:34:03.326684Z", 610 | "start_time": "2019-02-04T16:34:03.282394Z" 611 | } 612 | }, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": [ 617 | "205595" 618 | ] 619 | }, 620 | "execution_count": 53, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "df['Related Links Clicks row total'].sum()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [] 635 | } 636 | ], 637 | "metadata": { 638 | "kernelspec": { 639 | "display_name": "Python 3", 640 | "language": "python", 641 | "name": "python3" 642 | }, 643 | "language_info": { 644 | "codemirror_mode": { 645 | "name": "ipython", 646 | "version": 3 647 | }, 648 | "file_extension": ".py", 649 | "mimetype": "text/x-python", 650 | "name": "python", 651 | "nbconvert_exporter": "python", 
652 | "pygments_lexer": "ipython3", 653 | "version": "3.6.0" 654 | }, 655 | "toc": { 656 | "base_numbering": 1, 657 | "nav_menu": {}, 658 | "number_sections": true, 659 | "sideBar": true, 660 | "skip_h1_title": false, 661 | "title_cell": "Table of Contents", 662 | "title_sidebar": "Contents", 663 | "toc_cell": false, 664 | "toc_position": { 665 | "height": "507px", 666 | "left": "62px", 667 | "top": "154px", 668 | "width": "165px" 669 | }, 670 | "toc_section_display": true, 671 | "toc_window_display": true 672 | } 673 | }, 674 | "nbformat": 4, 675 | "nbformat_minor": 2 676 | } 677 | --------------------------------------------------------------------------------
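The notebook above flags that `is_related` only runs two independent substring checks against the `Sequence` string, so a journey can be counted even when `relatedLinkClicked` and `Related content` come from different events. A minimal event-level sketch of a stricter check is below; it assumes the `((category, action), count)` structure that `Event_cat_act_agg` already uses, but the toy dataframe and the `related_click_count` helper are illustrative stand-ins, not part of the repository.

```python
import ast

import pandas as pd

# Hypothetical toy frame standing in for the sampled journey data.
# Event_cat_act_agg is a string repr of ((category, action), count) tuples,
# matching the column parsed with ast.literal_eval in the notebook.
toy = pd.DataFrame({
    "Occurrences": [3, 2, 5],
    "Event_cat_act_agg": [
        "[(('relatedLinkClicked', '1.1 Related content'), 2)]",
        "[(('relatedLinkClicked', '2.1 Explore the topic'), 1)]",
        "[(('navClicked', '1 Sidebar'), 1)]",
    ],
})


def related_click_count(event_list):
    """Count clicks where the category is 'relatedLinkClicked' AND the action
    mentions 'Related content' -- both conditions applied to the same event,
    unlike the two independent substring checks on Sequence."""
    return sum(
        count
        for (category, action), count in event_list
        if category == "relatedLinkClicked" and "Related content" in action
    )


events = toy["Event_cat_act_agg"].apply(ast.literal_eval)
toy["Has_Related"] = events.map(related_click_count) > 0

# journey_click_rate: journeys with at least one related-link click divided by
# all journeys, weighting each journey type by its Occurrences.
rate = toy.loc[toy["Has_Related"], "Occurrences"].sum() / toy["Occurrences"].sum()
print(rate)  # 0.3 for this toy data: only the first row counts
```

Because the count is taken per event, the same helper also yields the per-sequence click totals that the notebook's `sum_related_click_events` produces for the navigation-vs-related-links ratio, so the event-level and substring-level numerators can be compared directly on the real data.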