├── notebooks ├── .gitkeep ├── network_analysis │ ├── README.md │ └── extract_brexit_taxon_data.ipynb ├── eda │ ├── notebook_functions.py │ └── look_at_sampling_data.ipynb └── taxon │ ├── taxon_translate.ipynb │ └── taxon_eda.ipynb ├── src ├── __init__.py ├── data │ ├── .gitkeep │ ├── __init__.py │ ├── make_dataset.py │ ├── archived_multiprocess │ │ ├── __init__.py │ │ ├── test_make_dataset.py │ │ └── multiprocess_utils.py │ ├── tests │ │ ├── test.sql │ │ ├── query.sql │ │ ├── test_bq_extract_data.py │ │ ├── test_preprocess.py │ │ └── check_dataset.ipynb │ ├── queries │ │ ├── query_to_fail.sql │ │ ├── simple_test.sql │ │ ├── prelim_meta_standard_query.sql │ │ ├── stnd_taxon_ab.sql │ │ ├── standard_query.sql │ │ ├── prelim_meta_standard_query_with_pageseq.sql │ │ ├── stnd_taxon_no_len_1_devcounts.sql │ │ ├── stnd_taxon.sql │ │ └── stnd_taxon_no_len_1.sql │ ├── preprocess_dataset.py │ ├── preprocess_dataset_thinner.py │ ├── taxon_translate.py │ ├── bq_extract_data.py │ ├── preprocess.py │ ├── make_network_data.py │ └── merge_dataset.py ├── features │ ├── .gitkeep │ ├── __init__.py │ ├── tests │ │ └── test_build_features.py │ └── build_features.py ├── models │ ├── .gitkeep │ ├── __init__.py │ ├── predict_model.py │ └── train_model.py ├── visualization │ ├── .gitkeep │ ├── __init__.py │ └── visualize.py ├── logging.conf └── analysis │ └── journey_events_analysis.py ├── data ├── raw_bq_extract │ └── .gitkeep ├── processed_journey │ └── .gitkeep └── processed_network │ └── .gitkeep ├── reports └── figures │ └── .gitkeep ├── network_data_pipeline.png ├── .envrc ├── .github └── workflows │ └── ci.yml ├── LICENSE ├── .gitignore ├── requirements.txt └── CONTRIBUTING.md /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/features/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/raw_bq_extract/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/make_dataset.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/features/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/models/predict_model.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/train_model.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/visualization/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed_journey/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed_network/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/figures/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/visualization/visualize.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/archived_multiprocess/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/tests/test.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM tables 3 | WHERE thing < 5 4 | -------------------------------------------------------------------------------- /network_data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alphagov/govuk-network-data/HEAD/network_data_pipeline.png -------------------------------------------------------------------------------- /src/data/tests/query.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 3 | TIME_STAMP)) 4 | WHERE PageSeq_Length > 1 5 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | export GDRIVE_DATADIR="/Volumes/GoogleDrive/Team Drives/GOV.UK teams/2018-2019/Q3/Knowledge up Q3/Data science/data/" 2 | export DATA_DIR="$PWD/data" 3 | export REPORTS_DIR="$PWD/reports" 4 | export LOGGING_CONFIG="$PWD/src/logging.conf" 5 | export BQ_KEY_DIR="$PWD/key" 6 | export QUERIES_DIR="$PWD/src/data/queries/" 7 | export DOCUMENTS="$HOME/Documents" 8 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: actions/setup-python@v2 9 | with: 10 | python-version: 
'3.6' 11 | - run: sudo apt-get install python-dev libxml2-dev libxslt-dev libz-dev 12 | - run: python -m pip install --upgrade pip 13 | - run: pip install -r requirements.txt 14 | - run: cd ./src/data/ && python -m pytest tests/ 15 | - run: cd ./src/features/ && python -m pytest tests/ 16 | -------------------------------------------------------------------------------- /src/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=consoleHandler,fileHandler 6 | 7 | [formatters] 8 | keys=pipelineFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler,fileHandler 13 | qualname=pipeline 14 | propagate=0 15 | 16 | [handler_consoleHandler] 17 | class=StreamHandler 18 | level=DEBUG 19 | formatter=pipelineFormatter 20 | args=(sys.stdout, ) 21 | 22 | [handler_fileHandler] 23 | class=FileHandler 24 | level=DEBUG 25 | formatter=pipelineFormatter 26 | args=('/tmp/govuk-network-data.log', ) 27 | 28 | [formatter_pipelineFormatter] 29 | format=%(asctime)s - %(name)s - %(levelname)s - %(funcName)s %(message)s 30 | datefmt= 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Government Digital Service 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/data/archived_multiprocess/test_make_dataset.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/data/ 2 | # python3 -m pytest tests/ 3 | import make_dataset 4 | import pandas as pd 5 | import pandas as pd 6 | 7 | def test_list_to_dict(): 8 | assert make_dataset.list_to_dict(['Desktop', 'Tablet', 'Mobile', 'Desktop', 'Mobile', 'Desktop']) ==\ 9 | [('Desktop', 3), ('Tablet', 1), ('Mobile', 2)] 10 | 11 | 12 | def test_str_to_dict(): 13 | assert make_dataset.str_to_dict("Mobile,Desktop,Mobile") ==\ 14 | [("Mobile", 2),("Desktop", 1)] 15 | 16 | 17 | def test_aggregate_dict(): 18 | assert make_dataset.aggregate_dict([[("Desktop", 3), ("Tablet", 1), ("Mobile", 2)] + 19 | [("Desktop", 3), ("Tablet", 1), ("Mobile", 2)]]) ==\ 20 | [('Desktop', 6), ('Tablet', 2), ('Mobile', 4)] 21 | 22 | 23 | # DATA PIPELINE 24 | # generate some test data in 25 | user_journey_dict = { 26 | 'Occurrences': [1, 12, 35], 27 | 'Sequence': ["/page1<>/page2<>/page2<>/page2>>/page2"] 29 | } 30 | 31 | user_journey_df = pd.DataFrame(user_journey_dict) 32 | 33 | def test_data_exists(): 34 | assert user_journey_df is not None 35 | assert user_journey_df.shape == (3, 3) 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .python-version 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | #pyenv environment list 34 | list/ 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | # DotEnv configuration 64 | .env 65 | 66 | # Database 67 | *.db 68 | *.rdb 69 | 70 | # Pycharm 71 | .idea 72 | 73 | # VS Code 74 | .vscode/ 75 | 76 | # Spyder 77 | .spyproject/ 78 | 79 | # Jupyter NB Checkpoints 80 | .ipynb_checkpoints/ 81 | 82 | # Some notebooks 83 | 2018-10-15-read_data_into_df.ipynb 84 | 85 | # exclude data from source control by default 86 | # we don't exclude to show data folder structure 87 | # data/ 88 | *.gz 89 | *.csv 90 | 91 | # Mac OS-specific storage files 92 | .DS_Store 93 | 94 | # exclude BQ key 95 | key/* 96 | -------------------------------------------------------------------------------- /src/data/queries/query_to_fail.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | PageSeq_Length, 4 | Actions_Length, 5 | GROUP_CONCAT(TrafficSource,",") AS TrafficSources, 6 | GROUP_CONCAT(TrafficMedium,",") AS TrafficMediums, 7 | Sequence 8 | FROM ( 9 | SELECT 10 | * 11 | FROM ( 12 | ----SELECT 13 | CONCAT(fullVisitorId,"-",STRING(visitId),"-",STRING(visitNumber),"-",STRING(TIMESTAMP(INTEGER(visitStartTime*1000000)))) AS sessionId, 14 | GROUP_CONCAT(CONCAT(pagePath,"::",CONCAT(IFNULL(hits.eventInfo.eventCategory,"NULL"),"//",IFNULL(hits.eventInfo.eventAction,"NULL"))),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Sequence, 15 | TrafficSource, 16 | TrafficMedium, 17 | Date, 18 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Actions_Length, 19 | SUM(IF(hits.type='PAGE',1,0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS PageSeq_Length 20 | FROM ( 21 | SELECT 22 | fullVisitorId, 23 | visitId, 24 | visitNumber, 25 | visitStartTime, 26 | hits.page.pagePath AS pagePath, 27 | hits.hitNumber AS hitNumber, 28 | trafficSource.source AS TrafficSource, 29 | trafficSource.medium AS TrafficMedium, 30 | hits.eventInfo.eventAction, 31 | date AS Date, 32 | hits.type, 33 | hits 34 | Date, 35 | Actions_Length, 36 | PageSeq_Length) 37 | GROUP BY 38 | Sequence, 39 | PageSeq_Length, 40 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/simple_test.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | PageSeq_Length, 4 | Actions_Length, 5 | GROUP_CONCAT(TrafficSource,",") AS TrafficSources, 6 | GROUP_CONCAT(TrafficMedium,",") AS TrafficMediums, 7 | Date, 8 | Sequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",STRING(visitId),"-",STRING(visitNumber),"-",STRING(TIMESTAMP(INTEGER(visitStartTime*1000000)))) AS sessionId, 15 | GROUP_CONCAT(CONCAT(pagePath,"::",CONCAT(IFNULL(hits.eventInfo.eventCategory,"NULL"),"//",IFNULL(hits.eventInfo.eventAction,"NULL"))),">>") OVER (PARTITION BY fullVisitorId, 
visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Sequence, 16 | TrafficSource, 17 | TrafficMedium, 18 | Date, 19 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS Actions_Length, 20 | SUM(IF(hits.type='PAGE',1,0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.hitNumber rows BETWEEN unbounded preceding AND unbounded following ) AS PageSeq_Length 21 | FROM ( 22 | SELECT 23 | fullVisitorId, 24 | visitId, 25 | visitNumber, 26 | visitStartTime, 27 | hits.page.pagePath AS pagePath, 28 | hits.hitNumber AS hitNumber, 29 | trafficSource.source AS TrafficSource, 30 | trafficSource.medium AS TrafficMedium, 31 | hits.eventInfo.eventAction, 32 | date AS Date, 33 | hits.type, 34 | hits.eventInfo.eventCategory, 35 | FROM 36 | TABLE_DATE_RANGE([govuk-bigquery-analytics:87773428.ga_sessions_], 37 | TIME_STAMP)) 38 | WHERE 39 | PageSeq_Length > 1 40 | GROUP BY 41 | sessionId, 42 | Sequence, 43 | TrafficSource, 44 | TrafficMedium, 45 | Date, 46 | Actions_Length, 47 | PageSeq_Length) 48 | GROUP BY 49 | Sequence, 50 | PageSeq_Length, 51 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/prelim_meta_standard_query.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence 8 | FROM ( 9 | SELECT 10 | * 11 | FROM ( 12 | SELECT 13 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 14 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(IFNULL(eventCategory, "NULL"),"<:<",IFNULL(eventAction, "NULL"))), 15 | ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 16 | DeviceCategory, 17 | Date, 18 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 19 | SUM(IF(htype='PAGE', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 20 | SUM(IF(eventAction='ffYesClick', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 21 | FROM ( 22 | SELECT 23 | fullVisitorId, 24 | visitId, 25 | visitNumber, 26 | visitStartTime, 27 | hits.page.pagePath AS pagePath, 28 | hits.hitNumber AS hitNumber, 29 | hits.type AS htype, 30 | hits.eventInfo.eventAction AS eventAction, 31 | hits.eventInfo.eventCategory AS eventCategory, 32 | date AS Date, 33 | device.deviceCategory AS DeviceCategory 34 | FROM 35 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 36 | CROSS JOIN 37 | UNNEST(sessions.hits) AS hits )) 38 | GROUP BY 39 | sessionId, 40 | Sequence, 41 | DeviceCategory, 42 | Date, 43 | EventYes, 44 | Actions_Length, 45 | PageSeq_Length) 46 | GROUP BY 47 | Sequence, 48 | PageSeq_Length, 49 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon_ab.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | REPLACE(ab_variant,"AB_DIMENSION_VALUE_PREFIX:","") as ABVariant, 4 | 
STRING_AGG(DeviceCategory, ",") AS DeviceCategories, 5 | Sequence 6 | FROM ( 7 | SELECT 8 | * 9 | FROM ( 10 | SELECT 11 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 12 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 13 | "NULL"),"<:<",IFNULL(eventAction, 14 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 15 | STRING_AGG(IF(htype = 'PAGE', 16 | pagePath, 17 | NULL), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 18 | DeviceCategory, 19 | ab_variant 20 | FROM ( 21 | SELECT 22 | fullVisitorId, 23 | visitId, 24 | visitNumber, 25 | visitStartTime, 26 | hits.page.pagePath AS pagePath, 27 | hits.hitNumber AS hitNumber, 28 | hits.type AS htype, 29 | hits.eventInfo.eventAction AS eventAction, 30 | hits.eventInfo.eventCategory AS eventCategory, 31 | ( 32 | SELECT 33 | value 34 | FROM 35 | hits.customDimensions 36 | WHERE 37 | index=59) AS taxon, 38 | ( 39 | SELECT 40 | IFNULL(value,"NULL") 41 | FROM 42 | sessions.customDimensions 43 | WHERE 44 | index=65) AS ab_variant, 45 | device.deviceCategory AS DeviceCategory 46 | FROM 47 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 48 | CROSS JOIN 49 | UNNEST(sessions.hits) AS hits ) ) 50 | WHERE 51 | DeviceCategory != "tablet" and 52 | ab_variant LIKE 'AB_DIMENSION_VALUE_PREFIX:%' 53 | GROUP BY 54 | sessionId, 55 | Sequence, 56 | PageSequence, 57 | DeviceCategory, 58 | ab_variant) 59 | GROUP BY 60 | Sequence, 61 | ABVariant -------------------------------------------------------------------------------- /src/data/queries/standard_query.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | PageSeq_Length, 4 | Actions_Length, 5 | TrafficSource, 6 | TrafficMedium, 7 | Date, 8 | Sequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"::",CONCAT(IFNULL(eventCategory, 16 | "NULL"),"//",IFNULL(eventAction, 17 | "NULL"))), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | TrafficSource, 19 | TrafficMedium, 20 | Date, 21 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 22 | SUM(IF(htype='PAGE', 23 | 1, 24 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 25 | SUM(IF(eventAction='ffYesClick', 26 | 1, 27 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 28 | FROM ( 29 | SELECT 30 | fullVisitorId, 31 | visitId, 32 | visitNumber, 33 | visitStartTime, 34 | hits.page.pagePath AS pagePath, 35 | hits.hitNumber AS hitNumber, 36 | hits.type AS htype, 37 | hits.eventInfo.eventAction AS eventAction, 38 | hits.eventInfo.eventCategory AS eventCategory, 39 | date AS Date, 40 | trafficSource.source AS TrafficSource, 41 | trafficSource.medium AS TrafficMedium 42 | FROM 43 | `govuk-bigquery-analytics.87773428.ga_sessions_*` AS sessions 44 | CROSS JOIN 45 | UNNEST(sessions.hits) AS hits 46 
| WHERE 47 | _TABLE_SUFFIX BETWEEN start_date 48 | AND end_date)) 49 | GROUP BY 50 | sessionId, 51 | Sequence, 52 | TrafficSource, 53 | TrafficMedium, 54 | Date, 55 | EventYes, 56 | Actions_Length, 57 | PageSeq_Length) 58 | GROUP BY 59 | Sequence, 60 | PageSeq_Length, 61 | Actions_Length, 62 | TrafficSource, 63 | TrafficMedium, 64 | Date -------------------------------------------------------------------------------- /src/data/queries/prelim_meta_standard_query_with_pageseq.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, "NULL"),"<:<",IFNULL(eventAction, "NULL"))), 16 | ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 17 | STRING_AGG(IF(htype = 'PAGE',pagePath,NULL),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 18 | DeviceCategory, 19 | Date, 20 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 21 | SUM(IF(htype='PAGE', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 22 | SUM(IF(eventAction='ffYesClick', 1, 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 23 | FROM ( 24 | SELECT 25 | fullVisitorId, 26 | visitId, 27 | visitNumber, 28 | visitStartTime, 29 | hits.page.pagePath AS pagePath, 30 | hits.hitNumber AS hitNumber, 31 | hits.type AS htype, 32 | hits.eventInfo.eventAction AS eventAction, 33 | hits.eventInfo.eventCategory AS eventCategory, 34 | date AS Date, 35 | device.deviceCategory AS DeviceCategory 36 | FROM 37 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 38 | CROSS JOIN 39 | UNNEST(sessions.hits) AS hits )) 40 | GROUP BY 41 | sessionId, 42 | Sequence, 43 | PageSequence, 44 | DeviceCategory, 45 | Date, 46 | EventYes, 47 | Actions_Length, 48 | PageSeq_Length) 49 | GROUP BY 50 | Sequence, 51 | PageSequence, 52 | PageSeq_Length, 53 | Actions_Length -------------------------------------------------------------------------------- /src/features/tests/test_build_features.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/features/ 2 | # python -m pytest tests/ 3 | import build_features 4 | 5 | 6 | def test_has_loop(): 7 | assert build_features.has_loop(["page1", "page2", "page1"]) is False 8 | assert build_features.has_loop(["page1", "page2", "page2"]) is True 9 | 10 | 11 | def test_has_repetition(): 12 | assert build_features.has_repetition(["page1", "page2", "page3"]) is False 13 | # Yields true due to self-loop, should be run on collapsed-loop page lists 14 | assert build_features.has_repetition(["page1", "page1", "page1"]) is True 15 | assert build_features.has_repetition(["page1", "page2", "page3", "page1", "page4"]) is True 
16 | assert build_features.has_repetition(["page2", "page3", "page2", "page1"]) is True 17 | 18 | 19 | def test_count_event_cat(): 20 | assert build_features.count_event_cat([('eventCategory1', 'eventAction1'), 21 | ('eventCategory2', 'eventAction2'), 22 | ('eventCategory2', 'eventAction1')]) == 2 23 | 24 | 25 | def test_count_event_act(): 26 | assert build_features.count_event_act([('eventCategory1', 'eventAction1'), 27 | ('eventCategory2', 'eventAction2'), 28 | ('eventCategory2', 'eventAction1')], 29 | category='eventCategory1', action='eventAction1') == 1 30 | 31 | 32 | def test_aggregate_event_count(): 33 | assert build_features.aggregate_event_cat([('eventCategory1', 'eventAction1'), 34 | ('eventCategory2', 'eventAction2'), 35 | ('eventCategory2', 'eventAction1')]) == \ 36 | [('eventCategory1', 1), ('eventCategory2', 2)] 37 | 38 | 39 | def test_aggregate_event_cat_act(): 40 | assert build_features.aggregate_event_cat_act([('eventCategory1', 'eventAction1'), 41 | ('eventCategory2', 'eventAction2'), 42 | ('eventCategory2', 'eventAction1')]) == \ 43 | [(('eventCategory1', 'eventAction1'), 1), 44 | (('eventCategory2', 'eventAction2'), 1), 45 | (('eventCategory2', 'eventAction1'), 1)] 46 | -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon_no_len_1_devcounts.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | SUM(IF(DeviceCategory='mobile',1,0)) AS MobileCount, 4 | SUM(IF(DeviceCategory='desktop',1,0)) AS DesktopCount, 5 | PageSeq_Length, 6 | Actions_Length, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 16 | "NULL"),"<:<",IFNULL(eventAction, 17 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | STRING_AGG(IF(htype = 'PAGE', 19 | pagePath, 20 | NULL),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 21 | DeviceCategory, 22 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 23 | SUM(IF(htype='PAGE', 24 | 1, 25 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length 26 | FROM ( 27 | SELECT 28 | fullVisitorId, 29 | visitId, 30 | visitNumber, 31 | visitStartTime, 32 | hits.page.pagePath AS pagePath, 33 | hits.hitNumber AS hitNumber, 34 | hits.type AS htype, 35 | hits.eventInfo.eventAction AS eventAction, 36 | hits.eventInfo.eventCategory AS eventCategory, 37 | ( 38 | SELECT 39 | value 40 | FROM 41 | hits.customDimensions 42 | WHERE 43 | index=59) AS taxon, 44 | device.deviceCategory AS DeviceCategory 45 | FROM 46 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 47 | CROSS JOIN 48 | UNNEST(sessions.hits) AS hits ) 49 | ) 50 | WHERE 51 | PageSeq_Length >1 52 | GROUP BY 53 | sessionId, 54 | Sequence, 55 | PageSequence, 56 | DeviceCategory, 57 | Actions_Length, 58 | PageSeq_Length) 59 | GROUP BY 60 | Sequence, 61 | PageSequence, 62 | PageSeq_Length, 63 | Actions_Length 
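-- Explanatory note (added comments, derived only from the query text above): the outer SELECT returns one row per distinct (Sequence, PageSequence, PageSeq_Length, Actions_Length); Occurrences counts the sessions sharing that journey, while MobileCount and DesktopCount split those sessions by device category. Single-page journeys are removed by the PageSeq_Length > 1 filter, and the TIME_STAMP suffix in the ga_sessions_ table name is replaced with a concrete date (one table per day) before the query is submitted to BigQuery.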
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | atomicwrites==1.2.1 3 | attrs==18.2.0 4 | backcall==0.1.0 5 | bleach==3.3.0 6 | bokeh==1.3.4 7 | cachetools==2.1.0 8 | certifi==2018.10.15 9 | chardet==3.0.4 10 | Click==7.0 11 | cloudpickle==1.2.2 12 | colorcet==2.0.2 13 | cycler==0.10.0 14 | dask==2.5.2 15 | datashader==0.8.0 16 | datashape==0.5.2 17 | decorator==4.3.0 18 | defusedxml==0.5.0 19 | distributed==2.5.2 20 | entrypoints==0.2.3 21 | fsspec==0.5.2 22 | google-api-core==1.5.0 23 | google-auth==1.5.1 24 | google-auth-oauthlib==0.2.0 25 | google-cloud-bigquery==1.6.0 26 | google-cloud-core==0.28.1 27 | google-resumable-media==0.3.1 28 | googleapis-common-protos==1.5.3 29 | HeapDict==1.0.1 30 | holoviews==1.12.6 31 | idna==2.7 32 | imageio==2.6.1 33 | ipykernel==5.1.0 34 | ipython==7.0.1 35 | ipython-genutils==0.2.0 36 | ipywidgets==7.4.2 37 | jedi==0.13.1 38 | Jinja2==2.10.1 39 | jsonschema==2.6.0 40 | jupyter==1.0.0 41 | jupyter-client==5.3.4 42 | jupyter-console==6.0.0 43 | jupyter-core>=4.6.0 44 | kiwisolver==1.1.0 45 | locket==0.2.0 46 | MarkupSafe==1.0 47 | matplotlib==3.1.1 48 | mistune==0.8.4 49 | more-itertools==4.3.0 50 | msgpack==0.6.2 51 | multipledispatch==0.6.0 52 | nbconvert==5.4.0 53 | nbformat==4.4.0 54 | networkx==2.3 55 | notebook==6.1.5 56 | numba==0.49.0 57 | numpy==1.16.3 58 | oauthlib==2.1.0 59 | packaging==19.2 60 | pandas==0.25.1 61 | pandas-gbq==0.6.1 62 | pandocfilters==1.4.2 63 | param==1.9.2 64 | parso==0.3.1 65 | partd==1.0.0 66 | pexpect==4.6.0 67 | pickleshare==0.7.5 68 | Pillow==8.2.0 69 | pluggy==0.8.0 70 | prometheus-client==0.4.2 71 | prompt-toolkit==2.0.6 72 | protobuf==3.6.1 73 | psutil==5.6.7 74 | ptyprocess==0.6.0 75 | py==1.10.0 76 | pyasn1==0.4.4 77 | pyasn1-modules==0.2.2 78 | pyct==0.4.6 79 | Pygments==2.7.4 80 | pyparsing==2.4.2 81 | pytest==3.9.3 82 | python-dateutil==2.7.3 83 | python-louvain==0.13 84 | pytz==2018.5 85 | pyviz-comms==0.7.2 86 | PyWavelets==1.0.3 87 | PyYAML==5.4 88 | pyzmq==17.1.2 89 | qtconsole==4.4.2 90 | requests>=2.20.0 91 | requests-oauthlib==1.0.0 92 | rsa==4.7 93 | scikit-image==0.16.1 94 | scipy==1.3.1 95 | Send2Trash==1.5.0 96 | simplegeneric==0.8.1 97 | six==1.11.0 98 | sortedcontainers==2.1.0 99 | tblib==1.4.0 100 | terminado>=0.8.3 101 | testpath==0.4.2 102 | toolz==0.10.0 103 | tornado==5.1.1 104 | traitlets==4.3.2 105 | urllib3==1.25.9 106 | wcwidth==0.1.7 107 | webencodings==0.5.1 108 | widgetsnbextension==3.4.2 109 | xarray==0.14.0 110 | zict==1.0.0 111 | -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 16 | "NULL"),"<:<",IFNULL(eventAction, 17 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | STRING_AGG(IF(htype = 'PAGE', 19 | pagePath, 20 | NULL),">>") OVER 
(PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 21 | DeviceCategory, 22 | Date, 23 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 24 | SUM(IF(htype='PAGE', 25 | 1, 26 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 27 | SUM(IF(eventAction='ffYesClick', 28 | 1, 29 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 30 | FROM ( 31 | SELECT 32 | fullVisitorId, 33 | visitId, 34 | visitNumber, 35 | visitStartTime, 36 | hits.page.pagePath AS pagePath, 37 | hits.hitNumber AS hitNumber, 38 | hits.type AS htype, 39 | hits.eventInfo.eventAction AS eventAction, 40 | hits.eventInfo.eventCategory AS eventCategory, 41 | ( 42 | SELECT 43 | value 44 | FROM 45 | hits.customDimensions 46 | WHERE 47 | index=59) AS taxon, 48 | date AS Date, 49 | device.deviceCategory AS DeviceCategory 50 | FROM 51 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 52 | CROSS JOIN 53 | UNNEST(sessions.hits) AS hits ) ) 54 | GROUP BY 55 | sessionId, 56 | Sequence, 57 | PageSequence, 58 | DeviceCategory, 59 | Date, 60 | EventYes, 61 | Actions_Length, 62 | PageSeq_Length) 63 | GROUP BY 64 | Sequence, 65 | PageSequence, 66 | PageSeq_Length, 67 | Actions_Length -------------------------------------------------------------------------------- /src/data/queries/stnd_taxon_no_len_1.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) AS Occurrences, 3 | STRING_AGG(DeviceCategory,",") AS DeviceCategories, 4 | PageSeq_Length, 5 | Actions_Length, 6 | STRING_AGG(Date,",") AS Dates, 7 | Sequence, 8 | PageSequence 9 | FROM ( 10 | SELECT 11 | * 12 | FROM ( 13 | SELECT 14 | CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId, 15 | STRING_AGG(CONCAT(pagePath,"<<",CONCAT(htype,"<:<",IFNULL(eventCategory, 16 | "NULL"),"<:<",IFNULL(eventAction, 17 | "NULL")),"<<",taxon), ">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Sequence, 18 | STRING_AGG(IF(htype = 'PAGE', 19 | pagePath, 20 | NULL),">>") OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence, 21 | DeviceCategory, 22 | Date, 23 | COUNT(*) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Actions_Length, 24 | SUM(IF(htype='PAGE', 25 | 1, 26 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length, 27 | SUM(IF(eventAction='ffYesClick', 28 | 1, 29 | 0)) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS EventYes 30 | FROM ( 31 | SELECT 32 | fullVisitorId, 33 | visitId, 34 | visitNumber, 35 | visitStartTime, 36 | hits.page.pagePath AS pagePath, 37 | hits.hitNumber AS hitNumber, 38 | hits.type AS htype, 39 | hits.eventInfo.eventAction AS eventAction, 40 | hits.eventInfo.eventCategory AS eventCategory, 41 | ( 42 | SELECT 43 | value 44 | FROM 45 | hits.customDimensions 46 | WHERE 47 | index=59) AS taxon, 48 | date AS Date, 49 | 
device.deviceCategory AS DeviceCategory 50 | FROM 51 | `govuk-bigquery-analytics.87773428.ga_sessions_TIME_STAMP` AS sessions 52 | CROSS JOIN 53 | UNNEST(sessions.hits) AS hits ) 54 | ) 55 | WHERE 56 | PageSeq_Length >1 57 | GROUP BY 58 | sessionId, 59 | Sequence, 60 | PageSequence, 61 | DeviceCategory, 62 | Date, 63 | EventYes, 64 | Actions_Length, 65 | PageSeq_Length) 66 | GROUP BY 67 | Sequence, 68 | PageSequence, 69 | PageSeq_Length, 70 | Actions_Length -------------------------------------------------------------------------------- /src/features/build_features.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | 4 | def has_loop(page_list): 5 | """ 6 | Check if a list of page hits contains an adjacent page loop (A >> A >> B) == True. 7 | :param page_list: list of page hits derived from BQ user journey 8 | :return: True if there is a loop 9 | """ 10 | return any(i == j for i, j in zip(page_list, page_list[1:])) 11 | 12 | 13 | def has_repetition(page_list): 14 | """ 15 | Check if a list of page hits contains a page repetition (A >> B >> A) == True. 16 | Run on journeys with collapsed loops so stuff like A >> A >> B are not captured as a repetition. 17 | Similar to cycles/triangles, but from a flat perspective. 18 | :param page_list: list of page hits derived from BQ user journey 19 | :return: True if there is a repetition 20 | """ 21 | return len(set(page_list)) != len(page_list) 22 | 23 | 24 | # Counters for events 25 | def count_event_cat(event_list): 26 | """ 27 | TODO: possibly remove 28 | Count different event categories present in an event_list. Includes "NULL" events coming from page 29 | hits for the sake of completeness. Does not include frequency. 30 | :param event_list: list of event tuples (eventCategory,eventAction) 31 | :return: number of different eventCategories present 32 | """ 33 | return len(set([cat for cat, _ in event_list])) 34 | 35 | 36 | def count_event_act(event_list, category, action): 37 | """ 38 | TODO: possibly remove 39 | Count number of specific eventActions given a specific eventCategory 40 | :param event_list: list of event tuples (eventCategory,eventAction) 41 | :param category: target eventCategory 42 | :param action: target eventAction 43 | :return: count 44 | """ 45 | return [action for cat, action in event_list if cat == category].count(action) 46 | 47 | 48 | def aggregate_event_cat(event_list): 49 | """ 50 | Return a dictionary-like list of eventCategory frequency counts. 51 | :param event_list: list of event tuples (eventCategory,eventAction) 52 | :return: dict-like list of frequencies [(eventCat1, freq_1),(eventCat2, freq_2),...] 53 | """ 54 | return list(Counter([cat for cat, _ in event_list]).items()) 55 | 56 | 57 | def aggregate_event_cat_act(event_list): 58 | """ 59 | Return a dictionary-like list of (eventCategory,eventAction) frequency counts. 60 | :param event_list: list of event tuples (eventCategory,eventAction) 61 | :return: dict-like list of frequencies [((eventCat1,eventAction1) freq_1),((eventCat1,eventAction2) freq_2),...] 62 | """ 63 | return list(Counter([(cat, act) for cat, act in event_list]).items()) 64 | -------------------------------------------------------------------------------- /notebooks/network_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Python setup for MacOS 2 | 3 | This is a quick run through on how to set up Python on your machines. 
We'll be 4 | using `pip` to install our packages, and `pyenv` with its `pyenv-virtualenv` 5 | plugin to manage different Python versions and virtual environments, 6 | respectively. 7 | 8 | Python virtual environments allow you to create an isolated environment. This 9 | can have its own dependencies (different packages, different versions) 10 | completely separate from every other environment. 11 | 12 | These instructions have been adapted from [The Hitchhiker's Guide to Python](https://docs.python-guide.org/starting/install3/osx/). 13 | Further detail about `pyenv-virtualenv` can be found in its [documentation](https://github.com/pyenv/pyenv-virtualenv#pyenv-virtualenv). 14 | 15 | By default, macOS has Python 2 installed, but we need Python 3. 16 | 17 | Install [Homebrew](https://brew.sh/) using Terminal. 18 | ``` 19 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 20 | ``` 21 | 22 | Install the latest version of Python 3 using Homebrew; this should also install 23 | `pip` for you automatically. 24 | ``` 25 | brew install python 26 | ``` 27 | 28 | Add your newly-installed Python to PATH, and activate your changes. Then validate that your Python 3 version has 29 | been installed. 30 | ``` 31 | echo 'export PATH="/usr/local/opt/python/libexec/bin:$PATH"' >> ~/.bash_profile 32 | source ~/.bash_profile # activates your changes; alternatively, restart your Terminal 33 | python --version # as of Oct 2019, this should be Python 3.7.4 on Homebrew 34 | ``` 35 | 36 | Use Homebrew to install `pyenv` and its `pyenv-virtualenv` plugin, add the required 37 | lines to your `.bash_profile`, then activate the changes. 38 | ``` 39 | brew install pyenv 40 | brew install pyenv-virtualenv 41 | 42 | echo 'eval "$(pyenv init -)"' >> ~/.bash_profile 43 | echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.bash_profile 44 | source ~/.bash_profile # activates your changes; alternatively, restart your Terminal 45 | ``` 46 | 47 | Create a new Python virtual environment running Python 3.6.9; we'll call this 48 | virtual environment `govuk-network-data`. 49 | ``` 50 | pyenv virtualenv 3.6.9 govuk-network-data 51 | ``` 52 | 53 | You need to activate this virtual environment before installing packages and using 54 | it. 55 | ``` 56 | pyenv activate govuk-network-data 57 | ``` 58 | 59 | Now install the packages listed in the `requirements.txt` file in this 60 | repository. 61 | ``` 62 | pip install -r requirements.txt 63 | ``` 64 | 65 | To deactivate the virtual environment, run the following code: 66 | ``` 67 | pyenv deactivate 68 | ``` 69 | -------------------------------------------------------------------------------- /src/data/tests/test_bq_extract_data.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/data/ 2 | # python -m pytest tests/ 3 | import bq_extract_data 4 | 5 | 6 | def test_find_query(): 7 | # only returns .sql files, addresses issue 10 somewhat 8 | assert bq_extract_data.find_query("test_bq_extract_data.py", "./tests") is None 9 | assert bq_extract_data.find_query("quer", "./tests") == "./tests/query.sql" 10 | # returns first file to match query_arg, bug or feature? 
11 | assert bq_extract_data.find_query("", "./tests") == "./tests/test.sql" 12 | # potential bug spotted 13 | # assert bq_extract_data.find_query("query.sql", "./tests") == "./tests/query.sql" 14 | 15 | 16 | # test removing linebreaks from sql query file 17 | # add space for line breaks 18 | def test_read_query(): 19 | assert bq_extract_data.read_query("./tests/test.sql") == "SELECT * FROM tables WHERE thing < 5" 20 | # handles indent as represented by two-spaces 21 | assert bq_extract_data.read_query("./tests/query.sql") == "SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIME_STAMP)) WHERE PageSeq_Length > 1" 22 | 23 | 24 | def test_change_timestamp(): 25 | """ 26 | Unit test for change_timestamp. Tests for both "standard" and "legacy" SQL timestamp differences. 27 | """ 28 | # standard 29 | assert bq_extract_data.change_timestamp(x = "SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIME_STAMP)) WHERE PageSeq_Length > 1", date = "2018-12-31", dialect = "standard") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 20181231)) WHERE PageSeq_Length > 1' 30 | # legacy 31 | assert bq_extract_data.change_timestamp(x = "SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIME_STAMP)) WHERE PageSeq_Length > 1", date = "2018-12-31", dialect = "legacy") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], TIMESTAMP("2018-12-31"), TIMESTAMP("2018-12-31")))) WHERE PageSeq_Length > 1' 32 | # standard, input x with read_query output 33 | assert bq_extract_data.change_timestamp(x = bq_extract_data.read_query("./tests/query.sql"), date = "2018-12-31", dialect = "standard") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 20181231)) WHERE PageSeq_Length > 1' 34 | 35 | # functional test 36 | def test_find_read_change_timestamp_combined(): 37 | """ 38 | Combines the three functions above. A user provides an 39 | approximate name of the file in a given dir that holds their 40 | SQL query of interest. This is read in and converted to a string, 41 | replacing line breaks with spaces. This "SQL query" str 42 | then has its timestamps adjusted to the correct dialect 43 | and so that the correct table is read in BigQuery. 44 | One table per day. 
45 | """ 46 | assert bq_extract_data.change_timestamp(bq_extract_data.read_query(bq_extract_data.find_query("query", "./tests")), 47 | date = "2018-12-31", dialect = "standard") == 'SELECT * FROM TABLE_DATE_RANGE([govuk-bigquery-analytics:1337.ga_sessions_], 20181231)) WHERE PageSeq_Length > 1' 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/data/tests/test_preprocess.py: -------------------------------------------------------------------------------- 1 | # to get correct relative path, run the following command from ./src/data/ 2 | # python3 -m pytest tests/ 3 | import preprocess 4 | 5 | 6 | def test_bq_journey_to_pe_list(): 7 | assert preprocess.bq_journey_to_pe_list("page1<>page2<>") ==\ 8 | [('page1', 'eventCategory1<:>".join(page_list) + "\"" + "\t" 67 | 68 | # Writing events columns 69 | event_list = prep.extract_pe_components(page_event_list, 1) 70 | write_to_file += "\"" + str(event_list) + "\"" + "\t" 71 | write_to_file += "\"" + str(feat.count_event_cat(event_list)) + "\"" + "\t" 72 | write_to_file += "\"" + str(feat.aggregate_event_cat(event_list)) + "\"" + "\t" 73 | write_to_file += "\"" + str(feat.aggregate_event_cat_act(event_list)) + "\"" + "\t" 74 | 75 | # Writing taxon_list 76 | write_to_file += "\"" + str(prep.extract_cd_components(page_event_list, 2)) + "\"" + "\t" 77 | write_to_file += "\"" + str(prep.extract_pcd_list(page_event_list, 2)) + "\"" + "\t" 78 | 79 | # Writing loop column stuff 80 | de_looped = prep.collapse_loop(page_list) 81 | write_to_file += "\"" + str(de_looped) + "\"" + "\t" 82 | write_to_file += "\"" + ">>".join(de_looped) + "\"" 83 | 84 | write_to_file += "\n" 85 | 86 | if i % 500000 == 0: 87 | logging.info("At index: {}".format(i)) 88 | write_file.write(write_to_file.encode()) 89 | write_to_file = "" 90 | write_file.flush() 91 | 92 | if i == number_lines - 1 and write_to_file != "": 93 | logging.info("At index via last: {}".format(i)) 94 | write_file.write(write_to_file.encode()) 95 | write_to_file = "" 96 | write_file.flush() 97 | 98 | 99 | if __name__ == "__main__": 100 | 101 | parser = argparse.ArgumentParser(description='Module that produces a metadata-aggregated and ' 102 | 'preprocessed dataset (.csv.gz), given a merged file.') 103 | parser.add_argument('in_file', help='Input dataframe file, this module adds .csv.gz automatically ') 104 | 105 | args = parser.parse_args() 106 | 107 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 108 | logging.config.fileConfig(LOGGING_CONFIG) 109 | logger = logging.getLogger('preprocess_dataset') 110 | 111 | DATA_DIR = os.getenv("DATA_DIR") 112 | 113 | read_path = os.path.join(DATA_DIR, "raw_bq_extract", args.in_file+".csv.gz") 114 | write_path = os.path.join(DATA_DIR, "processed_journey", args.in_file.replace("merged", "preprocessed")+".csv.gz") 115 | 116 | if os.path.isfile(read_path): 117 | logging.info("Reading from \"{}\" and writing to \"{}\"...".format(read_path, write_path)) 118 | num_lines = count_lines(read_path) 119 | logging.info("Number of rows in dataframe: {}".format(num_lines)) 120 | logging.info("Reading, processing, writing file...") 121 | read_write_file(read_path, write_path, num_lines) 122 | else: 123 | logging.error("Input file \"{}\" does not exist.".format(read_path)) 124 | -------------------------------------------------------------------------------- /notebooks/eda/notebook_functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Helper functions used in the EDA notebooks 3 | ''' 4 | 
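# Explanatory comment (added; based only on the pipeline code shown earlier in this extract): the
# processed-journey CSVs store list-like columns (e.g. Page_List, Page_Event_List, DeviceCategories)
# as Python literal strings, so the helpers in this module rely on pd.eval to turn those strings back
# into lists before indexing or counting their elements.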
5 | import numpy as np 6 | import pandas as pd 7 | 8 | def get_end_page(Page_List): 9 | return pd.eval(Page_List)[-1] 10 | 11 | def get_end_page_event(Page_Event_List): 12 | return pd.eval(Page_Event_List)[-1][-1] 13 | 14 | def count_desktop(DeviceCategories): 15 | thelist = pd.eval(DeviceCategories) 16 | desktop = 0 17 | mobile = 0 18 | other = 0 19 | for i in range(len(thelist)): 20 | if thelist[i][0] =='desktop': 21 | desktop = thelist[i][1] 22 | elif thelist[i][0] =='mobile': 23 | mobile = thelist[i][1] 24 | else: 25 | other = thelist[i][1] 26 | return desktop, mobile, other 27 | 28 | def derive_new_variables(df): 29 | print("creating page sequence length vars") 30 | # string to list 31 | df['page_list_eval'] = df['Page_List'].map(pd.eval) 32 | # count list items in the page sequence (so this is page count for the journey) 33 | df['page_seq_len'] = df['page_list_eval'].map(len) 34 | 35 | # string to list 36 | df['page_list_NL_eval'] = df['Page_List_NL'].map(pd.eval) 37 | # Count the page sequence without loops so B -> A ->A is B -> A and length is 2 38 | df['page_seq_len_NL'] = df['page_list_NL_eval'].map(len) 39 | 40 | print("Creating search vars") 41 | 42 | # variable to count how many times do the keywords that identify search appear in the page sequence? 43 | df['count_search'] = df.PageSequence.str.count("/search?") + df.PageSequence.str.count("/search/") 44 | 45 | # new variable: does the event list include the term "start"? yes ->1, no ->0 46 | df['event_list_contains_start'] = np.where(df.Event_List.str.contains("start"), 1, 0) 47 | # new variable: does the page sequence include the term "start"? yes ->1, no ->0 48 | df['page_seq_contains_start'] = np.where(df.Sequence.str.contains("start"), 1, 0) 49 | # new variable: does the page sequence include the term "service.gov.uk"? yes ->1, no ->0 50 | # This identifies external links to a service which has passed a serivce assessment 51 | df['page_seq_contains_service.gov.uk'] = np.where(df.Sequence.str.contains("service.gov.uk"), 1, 0) 52 | 53 | df['final_page'] = df['Page_List'].map(get_end_page) 54 | df['final_interaction'] = df['Page_Event_List'].map(get_end_page_event) 55 | 56 | # new variable: does the page sequence include the terms which identify internal search? 
yes ->1, no ->0 57 | df['contains_search_regex'] = np.where( 58 | (df.PageSequence.str.contains("/search?")) | (df.PageSequence.str.contains("/search/")), 1, 0) 59 | 60 | df['contains_search_n'] = df['contains_search_regex'] * df['Page_Seq_Occurrences'] 61 | 62 | df['desktop'], df['mobile'], df['other_device'] = zip( 63 | *df['DeviceCategories'].map(count_desktop)) 64 | 65 | df['more_desktop'] = np.where(df['desktop'] > (df['mobile'] + df['other_device']), 1, 0) 66 | 67 | print("creating final_page_type") 68 | 69 | df['final_page_type'] = 'other' 70 | df.loc[df['final_page'].str.contains('/government/publications/'), 'final_page_type'] = 'government_publication' 71 | df.loc[df['final_page'].str.contains('log-in'), 'final_page_type'] = 'login' 72 | df.loc[df['final_page'].str.contains('sign-in'), 'final_page_type'] = 'login' 73 | df.loc[df['final_page'].str.contains('login'), 'final_page_type'] = 'login' 74 | df.loc[df['final_page'].str.contains('check'), 'final_page_type'] = 'check' 75 | df.loc[df['final_page'].str.contains('apply'), 'final_page_type'] = 'apply' 76 | df.loc[df['final_page'].str.contains('contact'), 'final_page_type'] = 'contact/enquiries' 77 | df.loc[df['final_page'].str.contains('enquiries'), 'final_page_type'] = 'contact/enquiries' 78 | df.loc[df['final_page'].str.contains(r'get-.*-information.*'), 'final_page_type'] = 'get_information' 79 | df.loc[df['final_page'].str.contains('send'), 'final_page_type'] = 'send' 80 | df.loc[df['final_page'].str.contains('find'), 'final_page_type'] = 'find' 81 | df.loc[df['final_page'].str.contains('calculat'), 'final_page_type'] = 'calculate/calculator' 82 | df.loc[df['final_page'].str.contains('order'), 'final_page_type'] = 'order' 83 | df.loc[df['final_page'].str.contains('manage'), 'final_page_type'] = 'manage' 84 | df.loc[df['final_page'].str.contains('update'), 'final_page_type'] = 'update' 85 | df.loc[df['final_page'].str.contains('eligibility'), 'final_page_type'] = 'eligibility' 86 | df.loc[df['final_page'].str.contains('estimate'), 'final_page_type'] = 'estimate' 87 | df.loc[df['final_page'].str.contains('renew'), 'final_page_type'] = 'renew' 88 | df.loc[df['final_page'].str.contains('pay'), 'final_page_type'] = 'pay' 89 | df.loc[df['final_page'].str.contains('claim'), 'final_page_type'] = 'claim' 90 | df.loc[df['final_page'].str.contains('change'), 'final_page_type'] = 'change' 91 | 92 | df['final_interaction_type'] = df.final_interaction.str.extract(r'<:<(.*)<:<', expand=False) 93 | df['final_external_link'] = df.final_interaction.str.extract(r'EVENT<:>".join(page_list) + "\"" + "\t" 70 | 71 | # Writing events columns 72 | event_list = prep.extract_pe_components(page_event_list, 1) 73 | # write_to_file += "\"" + str(event_list) + "\"" + "\t" 74 | # write_to_file += "\"" + str(feat.count_event_cat(event_list)) + "\"" + "\t" 75 | # write_to_file += "\"" + str(feat.aggregate_event_cat(event_list)) + "\"" + "\t" 76 | 77 | # Event_cat_act_agg 78 | write_to_file += "\"" + str(feat.aggregate_event_cat_act(event_list)) + "\"" + "\t" 79 | 80 | # # Writing taxon_list 81 | # write_to_file += "\"" + str(prep.extract_cd_components(page_event_list, 2)) + "\"" + "\t" 82 | # write_to_file += "\"" + str(prep.extract_pcd_list(page_event_list, 2)) + "\"" + "\t" 83 | 84 | # # Writing loop column stuff 85 | # de_looped = prep.collapse_loop(page_list) 86 | # write_to_file += "\"" + str(de_looped) + "\"" + "\t" 87 | # write_to_file += "\"" + ">>".join(de_looped) + "\"" 88 | 89 | write_to_file += "\n" 90 | 91 | if i % 500000 == 0: 92 | 
logging.info("At index: {}".format(i)) 93 | write_file.write(write_to_file.encode()) 94 | write_to_file = "" 95 | write_file.flush() 96 | 97 | if i == number_lines - 1 and write_to_file != "": 98 | logging.info("At index via last: {}".format(i)) 99 | write_file.write(write_to_file.encode()) 100 | write_to_file = "" 101 | write_file.flush() 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | parser = argparse.ArgumentParser(description='Module that produces a metadata-aggregated and ' 107 | 'preprocessed dataset (.csv.gz), given a merged file.') 108 | parser.add_argument('in_file', help='Input dataframe file, this module adds .csv.gz automatically ') 109 | 110 | args = parser.parse_args() 111 | 112 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 113 | logging.config.fileConfig(LOGGING_CONFIG) 114 | logger = logging.getLogger('preprocess_dataset') 115 | 116 | DATA_DIR = os.getenv("DATA_DIR") 117 | 118 | read_path = os.path.join(DATA_DIR, "raw_bq_extract", args.in_file+".csv.gz") 119 | write_path = os.path.join( 120 | DATA_DIR, "processed_journey", 121 | args.in_file.replace("merged", "preprocessed")+"_thinner.csv.gz") 122 | 123 | if os.path.isfile(read_path): 124 | logging.info("Reading from \"{}\" and writing to \"{}\"...".format(read_path, write_path)) 125 | num_lines = count_lines(read_path) 126 | logging.info("Number of rows in dataframe: {}".format(num_lines)) 127 | logging.info("Reading, processing, writing file...") 128 | read_write_file(read_path, write_path, num_lines) 129 | else: 130 | logging.error("Input file \"{}\" does not exist.".format(read_path)) 131 | -------------------------------------------------------------------------------- /src/data/archived_multiprocess/multiprocess_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def delete_vars(x): 6 | """ 7 | Force object deletion 8 | :param x: object to delete 9 | """ 10 | if isinstance(x, list): 11 | for xs in x: 12 | del xs 13 | del x 14 | 15 | 16 | def compute_max_depth(test_list, chunks, depth, fewer_than_cpu): 17 | """ 18 | Compute maximum recursive depth of process_dataframes, governs MAX_DEPTH global and at which point of execution 19 | one-off rows (based on Occurrence # of PageSequence) will be dropped. 20 | :param test_list: dummy list based on list of files to be read/processed. 
21 | :param chunks: initial number of partitions 22 | :param depth: init = 0, increases with every recursive call 23 | :return: (int) maximum recursive depth 24 | """ 25 | partitions = partition_list(test_list, chunks, fewer_than_cpu) 26 | if len(test_list) > 1: 27 | new_lst = [0 for _ in partitions] 28 | return compute_max_depth(new_lst, (lambda x: int(x / 2) if int(x / 2) > 0 else 1)(chunks), depth + 1, 29 | fewer_than_cpu) 30 | else: 31 | return depth 32 | 33 | 34 | def compute_initial_chunksize(number_of_files, num_cpu): 35 | """ 36 | 37 | :param num_cpu: 38 | :param number_of_files: 39 | :return: 40 | """ 41 | if number_of_files > num_cpu: 42 | return int(number_of_files / 2) 43 | else: 44 | return number_of_files 45 | 46 | 47 | def compute_batches(files, batchsize): 48 | """ 49 | 50 | :param files: 51 | :param batchsize: 52 | :return: 53 | """ 54 | 55 | if len(files) > int(np.ceil(batchsize * 1.5)): 56 | return True, merge_small_partition([files[i:i + batchsize] for i in range(0, len(files), batchsize)]) 57 | else: 58 | return False, files 59 | 60 | 61 | def merge_sliced_df(sliced_df_list: list, expected_size: int): 62 | """ 63 | Merge dataframe slices (column pairs) when appropriate (codes match) and append to a list of merged dataframes. 64 | Due to order of columns, the Occurrences slice will be used as a basis for the merge. 65 | :param sliced_df_list: list of slices 66 | :param expected_size: number of dataframes that have been originally sliced 67 | :return: list of merged dataframes 68 | """ 69 | final_list = [pd.DataFrame()] * expected_size 70 | # print([df.shape for i, df in sliced_df_list if i == 0]) 71 | # i = dataframe code, dataframes may come from multiple files. 72 | for i, df in sliced_df_list: 73 | # print(df.columns) 74 | if len(final_list[i]) == 0: 75 | # print("new") 76 | final_list[i] = df.copy(deep=True) 77 | else: 78 | # print("merge") 79 | final_list[i] = pd.merge(final_list[i], df, how='left', on='Sequence') 80 | return final_list 81 | 82 | 83 | def partition_list(dataframe_list: list, chunks: int, fewer_than_cpu): 84 | """ 85 | Build a list of partitions from a list of dataframes. Based on indices. 86 | :param dataframe_list: list of dataframes 87 | :param chunks: number of indices lists to generate, len(partition_list) 88 | :return: partition list, list of lists containing indices 89 | """ 90 | if chunks > 0: 91 | initial = [list(xs) for xs in np.array_split(list(range(len(dataframe_list))), chunks)] 92 | # print(initial) 93 | if len(initial) > 1 and not fewer_than_cpu: 94 | initial = merge_small_partition(initial) 95 | return initial 96 | else: 97 | return [[0]] 98 | 99 | 100 | def merge_small_partition(partitions: list): 101 | """ 102 | Merge small partitions of length 1 into previous partition, reduce number of recursive runs. 103 | :param partitions: 104 | :return: 105 | """ 106 | to_merge = [] 107 | for partition in partitions: 108 | if len(partition) == 1: 109 | to_merge.append(partition[0]) 110 | partitions.remove(partition) 111 | if len(to_merge) >= 1: 112 | partitions[-1].extend(to_merge) 113 | return partitions 114 | 115 | 116 | def slice_many_df(df_list, drop_one_offs, sliceable_cols, ordered=False): 117 | """ 118 | Slice a list of dataframes into their columns. First list will consist of 119 | (df_number, [Sequence, PageSequence, Occurrences]) 120 | slices, second list will consist of (df_number, [Sequence, AggregatableMetadata1]), 121 | (df_number, [Sequence, AggregatableMetadata2]) etc. 
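    When ordered is False, a single flat list of (df_number, slice) pairs is returned instead of the two lists.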
122 | Reduces size of dataframes passed on to worker processes, so they don't break. 123 | :param df_list: 124 | :param ordered: 125 | :return: 126 | """ 127 | if not ordered: 128 | return [(i, df.iloc[:, ind].copy(deep=True)) for i, df in enumerate(df_list) for ind in 129 | slice_dataframe(df, drop_one_offs, sliceable_cols)] 130 | else: 131 | return [(i, df.iloc[:, ind].copy(deep=True)) for i, df in enumerate(df_list) for ind in 132 | slice_dataframe(df, drop_one_offs, sliceable_cols) if 133 | "Occurrences" in df.columns[ind]], [(i, df.iloc[:, ind].copy(deep=True)) for i, df in enumerate(df_list) 134 | for ind in 135 | slice_dataframe(df, drop_one_offs, sliceable_cols) if 136 | "Occurrences" not in df.columns[ind]] 137 | 138 | 139 | def slice_dataframe(df, drop_one_offs, sliceable_cols): 140 | """ 141 | Computes the slices (column pairs) of dataframe 142 | :param df: dataframe to be sliced 143 | :param drop_one_offs: 144 | :param sliceable_cols: 145 | :return: list of dataframe slices 146 | """ 147 | sliced_df = [] 148 | for col in sliceable_cols: 149 | if col in df.columns: 150 | if col == "Occurrences": 151 | if drop_one_offs: 152 | sliced_df.append( 153 | [df.columns.get_loc("Sequence"), df.columns.get_loc("PageSequence"), df.columns.get_loc(col)]) 154 | else: 155 | sliced_df.append( 156 | [df.columns.get_loc("Sequence"), df.columns.get_loc(col)]) 157 | else: 158 | sliced_df.append([df.columns.get_loc("Sequence"), df.columns.get_loc(col)]) 159 | return sliced_df 160 | -------------------------------------------------------------------------------- /src/data/taxon_translate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | 7 | def recursive_parenting(taxon_df, content_id, parent_content_id, parent_list): 8 | """ 9 | Recursively compute a taxon's parents 10 | :param taxon_df: taxon dataframe from content tagger (taxon json file) 11 | :param content_id: target taxon content_id 12 | :param parent_content_id: target taxon's parent content_id 13 | :param parent_list: incrementing list of parents 14 | :return: recursive call, aggregated list of parents if top level 15 | """ 16 | if isinstance(parent_content_id, float) and len(parent_list) == 0: 17 | return [] 18 | elif isinstance(parent_content_id, float): 19 | return [[parent_taxon, i + 1] for i, parent_taxon in enumerate(reversed(parent_list))] 20 | else: 21 | content_id = parent_content_id 22 | parent_content_id = taxon_df[taxon_df.content_id == parent_content_id].iloc[0].parent_content_id 23 | title = taxon_df[taxon_df.content_id == content_id].iloc[0].title 24 | parent_list.append([content_id, parent_content_id, title]) 25 | return recursive_parenting(taxon_df, content_id, parent_content_id, parent_list) 26 | 27 | 28 | def build_taxon_set(taxon_series): 29 | """ 30 | Build set of unique taxons from the input taxon Series induced from the network node dataframe. 31 | :param taxon_series: Taxon column from the network node df, list of taxon content_id lists. 32 | :return: unique set containing taxon content_ids from nodes 33 | """ 34 | return set([content_id for taxon_list in taxon_series for content_id in taxon_list]) 35 | 36 | 37 | def map_taxon_content_ids(target_taxon_df, nodes_df): 38 | """ 39 | Extract taxons from node dataframe as a unique set of taxon content_ids and then compute their title, base_path 40 | (main component to be returned), level, parents (if any, else NaN) and finally the top-most parent. 
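    A taxon with no parent ends up with level 1, an empty parents list and itself as its level1_parent.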
41 | :param target_taxon_df: taxon dataframe from content tagger (taxon json file) 42 | :param nodes_df: dataframe with network nodes 43 | :return: dataframe containing taxon information 44 | """ 45 | 46 | column_list = ['content_id', 'title', 'base_path', 'level', 'parents', 'level1_parent'] 47 | taxon_level_df = pd.DataFrame(columns=column_list) 48 | 49 | taxon_set = build_taxon_set(nodes_df.Node_Taxon) 50 | 51 | for content_id in taxon_set: 52 | if target_taxon_df[target_taxon_df.content_id == content_id].shape[0] > 0: 53 | title = target_taxon_df[target_taxon_df.content_id == content_id].iloc[0].title 54 | base_path = target_taxon_df[target_taxon_df.content_id == content_id].iloc[0].base_path 55 | parent_list = pd.Series(recursive_parenting(target_taxon_df, content_id, 56 | target_taxon_df[ 57 | target_taxon_df.content_id == content_id].parent_content_id.values[ 58 | 0], [])) 59 | current_level = len(parent_list) + 1 60 | level1_par = title 61 | if len(parent_list.values) > 0: 62 | level1_par = parent_list.values[0][0][2] 63 | taxon_level_df = pd.concat([taxon_level_df, pd.DataFrame([[content_id, 64 | title, 65 | base_path, 66 | current_level, 67 | parent_list.values, 68 | level1_par]], columns=column_list)]) 69 | taxon_level_df.reset_index(drop=True, inplace=True) 70 | taxon_level_df.drop_duplicates(subset="content_id", keep="first", inplace=True) 71 | return taxon_level_df 72 | 73 | 74 | def add_taxon_basepath_to_df(node_df, taxon_level_df): 75 | """ 76 | Compute appropriate taxon base_paths for list of taxon content_ids and add to node dataframe. 77 | :param node_df: dataframe with network nodes 78 | :param taxon_level_df: dataframe containing taxon information (taxons nodes are tagged with) 79 | :return: augmented node dataframe, including taxon base_paths 80 | """ 81 | content_basepath_dict = dict(zip(taxon_level_df.content_id, taxon_level_df.base_path)) 82 | taxon_name_list = [] 83 | for tup in node_df.itertuples(): 84 | taxon_basepath = [] 85 | for taxon in tup.Node_Taxon: 86 | if taxon in content_basepath_dict.keys(): 87 | taxon_basepath.append(content_basepath_dict[taxon]) 88 | taxon_name_list.append(taxon_basepath) 89 | node_df['Node_Taxon_basepath'] = taxon_name_list 90 | return node_df 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser( 95 | description='Module to translate taxon content_ids in node file to taxon base paths. In addition, ecursively ' 96 | 'compute taxon' 97 | 'level, parents and top-most parents.') 98 | parser.add_argument('node_filename', help='Node input filename.') 99 | parser.add_argument('taxon_dir', help='Directory containing taxon json file.') 100 | parser.add_argument('taxon_output_filename', default="", 101 | help='Naming convention for resulting taxon dataframe file. 
Includes taxons that nodes in node ' 102 | 'file are tagged to.') 103 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 104 | args = parser.parse_args() 105 | 106 | DATA_DIR = os.getenv("DATA_DIR") 107 | nodes_path = os.path.join(DATA_DIR, "processed_data", args.node_filename + ".csv.gz") 108 | taxons_path = os.path.join(args.taxon_dir, "taxons.json.gz") 109 | 110 | if os.path.exists(taxons_path) and os.path.exists(nodes_path): 111 | print("Working on: {}".format(taxons_path)) 112 | taxons_json_df = pd.read_json(taxons_path, compression="gzip") 113 | print("Working on: {} ".format(nodes_path)) 114 | nodes_df = pd.read_csv(nodes_path, sep="\t", compression="gzip") 115 | 116 | taxon_df = map_taxon_content_ids(taxons_json_df, nodes_df) 117 | nodes_df = add_taxon_basepath_to_df(nodes_df, taxon_df) 118 | 119 | # overwrite option? should it be an option or default? 120 | nodes_df.to_csv(nodes_path.replace(".csv.gz", "_taxon_base_path.csv.gz"), sep="\t", compression="gzip", 121 | index=False) 122 | # save taxon-specific dataframe 123 | taxon_output_path = os.path.join(DATA_DIR, "processed_data", args.taxon_output_filename) 124 | taxon_df.to_csv(taxon_output_path, compression="gzip", index=False) 125 | else: 126 | print("Files do not exist:\n {}: {},\n {}: {}".format(taxons_path, os.path.exists(taxons_path), nodes_path, 127 | os.path.exists(nodes_path))) 128 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing to govuk-network-data 2 | ================================== 3 | 4 | Welcome! govuk-network-data is a community project that aims to work for a wide 5 | range of Python users and Python codebases. If you're trying govuk-network-data on 6 | your Python code, your experience and what you can contribute are 7 | important to the project's success. 8 | 9 | 10 | Getting started, building, and testing 11 | -------------------------------------- 12 | 13 | If you haven't already, take a look at the project's 14 | [README.md file](README.md). 15 | 16 | Discussion 17 | ---------- 18 | 19 | If you've run into behavior in govuk-network-data you don't understand, or you're 20 | having trouble working out a good way to apply it to your code, or 21 | you've found a bug or would like a feature it doesn't have, we want to 22 | hear from you! 23 | 24 | Our main forum for discussion is the project's [GitHub issue 25 | tracker](https://github.com/python/mypy/issues). This is the right 26 | place to start a discussion of any of the above or most any other 27 | topic concerning the project. 28 | 29 | #### Code of Conduct 30 | 31 | Everyone participating in the govuk-network-data community, and in particular in our 32 | issue tracker, pull requests, and Slack channel, is expected to treat 33 | other people with respect and more generally to follow the guidelines 34 | articulated in the [Python Community Code of 35 | Conduct](https://www.python.org/psf/codeofconduct/). 36 | 37 | Submitting Changes 38 | ------------------ 39 | 40 | Even more excellent than a good bug report is a fix for a bug, or the 41 | implementation of a much-needed new feature. (*) We'd love to have 42 | your contributions. 43 | 44 | (*) If your new feature will be a lot of work, we recommend talking to 45 | us early -- see below. 
46 | 47 | We use the usual GitHub pull-request flow, which may be familiar to 48 | you if you've contributed to other projects on GitHub. For the mechanics, 49 | see [our git and GitHub workflow help page](https://github.com/python/mypy/wiki/Using-Git-And-GitHub), 50 | or [GitHub's own documentation](https://help.github.com/articles/using-pull-requests/). 51 | 52 | Anyone interested in govuk-network-data may review your code. One of the govuk-network-data core 53 | developers will merge your pull request when they think it's ready. 54 | For every pull request, we aim to promptly either merge it or say why 55 | it's not yet ready; if you go a few days without a reply, please feel 56 | free to ping the thread by adding a new comment. 57 | 58 | Preparing Changes 59 | ----------------- 60 | 61 | Before you begin: if your change will be a significant amount of work 62 | to write, we highly recommend starting by opening an issue laying out 63 | what you want to do. That lets a conversation happen early in case 64 | other contributors disagree with what you'd like to do or have ideas 65 | that will help you do it. 66 | 67 | The best pull requests are focused, clearly describe what they're for 68 | and why they're correct, and contain tests for whatever changes they 69 | make to the code's behavior. As a bonus these are easiest for someone 70 | to review, which helps your pull request get merged quickly! Standard 71 | advice about good pull requests for open-source projects applies. 72 | 73 | For coding conventions see the reference to 74 | [PEP 8](https://www.python.org/dev/peps/pep-0008/) -- for the code you 75 | put in the pull request. 76 | 77 | Also, do not squash your commits after you have submitted a pull request, as this 78 | erases context during review. We will squash commits when the pull request is merged. 79 | 80 | You may also find other pages in the 81 | [govuk-network-data developer guide](https://github.com/python/mypy/wiki/Developer-Guides) 82 | helpful in developing your change. 83 | 84 | 85 | Core developer guidelines 86 | ------------------------- 87 | 88 | Core developers should follow these rules when processing pull requests: 89 | 90 | * Always wait for tests to pass before merging PRs. 91 | * Use "[Squash and merge](https://github.com/blog/2141-squash-your-commits)" 92 | to merge PRs. 93 | * Delete branches for merged PRs (by core devs pushing to the main repo). 94 | * Edit the final commit message before merging to conform to the following 95 | style (we wish to have a clean `git log` output): 96 | * When merging a multi-commit PR make sure that the commit message doesn't 97 | contain the local history from the committer and the review history from 98 | the PR. Edit the message to only describe the end state of the PR. 99 | * Make sure there is a *single* newline at the end of the commit message. 100 | This way there is a single empty line between commits in `git log` 101 | output. 102 | * Split lines as needed so that the maximum line length of the commit 103 | message is under 80 characters, including the subject line. 104 | * Capitalize the subject and each paragraph. 105 | * Make sure that the subject of the commit message has no trailing dot. 106 | * Use the imperative mood in the subject line (e.g. "Fix typo in README"). 107 | * If the PR fixes an issue, make sure something like "Fixes #xxx." occurs 108 | in the body of the message (not in the subject). 109 | * Use Markdown for formatting. 
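
For example, a squash-merged commit message following these rules might
look like this (illustrative only -- the issue number is made up):

    Add taxon base_path translation to the node pipeline

    Map taxon content_ids in the node file to their base_paths and compute
    each taxon's level, parents and top-most parent.

    Fixes #123.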
110 | 111 | 112 | Issue-tracker conventions 113 | ------------------------- 114 | 115 | We aim to reply to all new issues promptly. We'll assign a milestone 116 | to help us track which issues we intend to get to when, and may apply 117 | labels to carry some other information. Here's what our milestones 118 | and labels mean. 119 | 120 | Sometimes this information might be on Trello and not duplicated on Github, however when we open this code up we should endeavour to rely on Github. 121 | 122 | ### Task priority and sizing 123 | 124 | We use GitHub "labels" ([see our 125 | list](https://github.com/python/mypy/labels)) to roughly order what we 126 | want to do soon and less soon. There's two dimensions taken into 127 | account: **priority** (does it matter to our users) and **size** (how 128 | long will it take to complete). 129 | 130 | Bugs that aren't a huge deal but do matter to users and don't seem 131 | like a lot of work to fix generally will be dealt with sooner; things 132 | that will take longer may go further out. 133 | 134 | We are trying to keep the backlog at a manageable size, an issue that is 135 | unlikely to be acted upon in foreseeable future is going to be 136 | respectfully closed. This doesn't mean the issue is not important, but 137 | rather reflects the limits of the team. 138 | 139 | The **question** label is for issue threads where a user is asking a 140 | question but it isn't yet clear that it represents something to actually 141 | change. We use the issue tracker as the preferred venue for such 142 | questions, even when they aren't literally issues, to keep down the 143 | number of distinct discussion venues anyone needs to track. These might 144 | evolve into a bug or feature request. 145 | 146 | Issues **without a priority or size** haven't been triaged. We aim to 147 | triage all new issues promptly, but there are some issues from previous 148 | years that we haven't yet re-reviewed since adopting these conventions. 149 | 150 | ### Other labels 151 | 152 | * **needs discussion**: This issue needs agreement on some kind of 153 | design before it makes sense to implement it, and it either doesn't 154 | yet have a design or doesn't yet have agreement on one. 155 | * **feature**, **bug**, **crash**, **refactoring**, **documentation**: 156 | These classify the user-facing impact of the change. Specifically 157 | "refactoring" means there should be no user-facing effect. 158 | * **topic-** labels group issues touching a similar aspect of the 159 | project, for example PEP 484 compatibility, a specific command-line 160 | option or dependency. 161 | -------------------------------------------------------------------------------- /src/data/bq_extract_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import fnmatch 4 | import logging.config 5 | import os 6 | import sys 7 | import traceback 8 | 9 | import pandas as pd 10 | 11 | 12 | def find_query(query_arg, query_dir): 13 | """(str, str) -> str 14 | Return the relative path of the first file in 15 | query_dir that contains a match for query_arg string. 16 | The first file will be based on alphabetical order. 
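    Matching uses fnmatch with the pattern "*<query_arg>*.sql", so e.g. 'taxon_ab' would match the stnd_taxon_ab query file.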
17 | >>>find_query('work', './') 18 | './work' 19 | """ 20 | for file in os.listdir(query_dir): 21 | if fnmatch.fnmatch(file, "*" + query_arg + "*.sql"): 22 | return os.path.join(query_dir, file) 23 | 24 | 25 | def read_query(filepath): 26 | """(str) -> str 27 | Opens the file at filepath for reading, removing /n 28 | before rejoining seperate lines with " " seperator. 29 | """ 30 | with open(filepath, 'r') as file: 31 | lines = " ".join(line.strip("\n") for line in file) 32 | return lines 33 | 34 | 35 | def change_timestamp(x, date, dialect): 36 | """(str, str, str) -> str 37 | Replace the timestamp in x, where x is the SQL query from file, 38 | with the date, using the desired SQL dialect, which defaults to legacy. 39 | """ 40 | if dialect == "standard": 41 | return x.replace("TIME_STAMP", date.replace("-", "")) 42 | else: 43 | change = str("TIMESTAMP(\"") + date + "\"), " + str("TIMESTAMP(\"") + date + "\"))" 44 | return x.replace("TIME_STAMP", change) 45 | 46 | 47 | def looped_query(query_from_file, date_range, exclude_dates, project_id, key_path, destination_dir, filename_stub, 48 | dialect="legacy"): 49 | """(str, list, list, str, str, str, str) -> file 50 | Saves a compressed csv with filename_stub suffixed to date queried 51 | into destination_dir. They'll be one .csv per day queried. The query is 52 | derived from query_from_file and run against dates in the date_range 53 | that are not excluded by exclude_dates. The project_id and key_path 54 | are used to query the correct table and provide the permissions 55 | for the query to run using BigQuery. These csv files can be 56 | merged later in the pipeline with make_dataset.py. 57 | """ 58 | runs = len(date_range) - len(exclude_dates) 59 | 60 | logging.info(query_from_file) 61 | 62 | for i, date in enumerate(date_range): 63 | logger.info("RUN {} OUT OF {}".format(str(i + 1), runs)) 64 | if date not in exclude_dates: 65 | df_in = None 66 | logger.info("Working on: {}".format(date)) 67 | logger.info("Query start...") 68 | query_for_paths = change_timestamp(query_from_file, date, dialect) 69 | 70 | try: 71 | df_in = pd.io.gbq.read_gbq(query_for_paths, 72 | project_id=project_id, 73 | reauth=False, 74 | # verbose=True, 75 | private_key=key_path, 76 | dialect=dialect) 77 | except Exception as e: 78 | logging.error("Oops, gbq failed.\n======\n {} \n======\n".format(traceback.format_exc())) 79 | 80 | if df_in is not None: 81 | file_name = os.path.join(destination_dir, filename_stub + "_" + str(date) + '.csv.gz') 82 | logger.info("Saving at: {}".format(file_name)) 83 | df_in.to_csv(file_name, compression='gzip', index=False, sep="\t") 84 | logger.info("Saved to file.") 85 | else: 86 | logger.error("Nothing to save, query failed.") 87 | 88 | else: 89 | logger.info("Skipped target date: {}".format(date)) 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser( 94 | description='BigQuery extractor module', 95 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 96 | parser.add_argument('start_date', help='Start date in Y-m-d, eg 2018-12-31') 97 | parser.add_argument('end_date', help='End date in Y-m-d, eg 2018-12-31') 98 | parser.add_argument('filename', help='Naming convention for resulting dataframe file(s).') 99 | parser.add_argument('query', help=''' 100 | Name of query to use, within queries directory (specified by 101 | environment variable QUERIES_DIR). The first file in query_dir that 102 | contains a match for query string is used, this is based on 103 | alphabetical order. 
104 | ''') 105 | parser.add_argument('dest_dir', default="", nargs="?", 106 | help='Specialized destination directory for resulting dataframe file(s).') 107 | parser.add_argument('--standard', action='store_true', default=False, 108 | help='Specify BigQuery dialect. Legacy default.') 109 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 110 | parser.add_argument('--ab_test_prefix', help=''' 111 | For use with the stnd_taxon_ab query, prefix of the value in the AB 112 | test custom dimension, the bit before the colon, not including it, for 113 | example, if you care about values 'RelatedLinksAATest:A' and 114 | 'RelatedLinksAATest:B', pass 'RelatedLinksAATest' through this arg. 115 | ''') 116 | args = parser.parse_args() 117 | if args.standard: 118 | dialect = "standard" 119 | else: 120 | dialect = "legacy" 121 | # Logger setup 122 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 123 | logging.config.fileConfig(LOGGING_CONFIG) 124 | logger = logging.getLogger('bq_extract') 125 | 126 | if args.quiet: 127 | logging.disable(logging.DEBUG) 128 | # BQ PROJECT SETUP 129 | ProjectID = 'govuk-bigquery-analytics' 130 | KEY_DIR = os.getenv("BQ_KEY_DIR") 131 | key_file_path = os.path.join(KEY_DIR, os.listdir(KEY_DIR)[0]) 132 | 133 | # DATA DIRECTORIES 134 | QUERIES_DIR = os.getenv("QUERIES_DIR") 135 | DATA_DIR = os.getenv("DATA_DIR") 136 | dest_dir = os.path.join(DATA_DIR, args.dest_dir if args.dest_dir != "" else "raw_bq_extract") 137 | 138 | # DATAFRAME FILENAME(S) 139 | filename = args.filename 140 | 141 | # DATES TO EVALUATE 142 | start_date = datetime.datetime.strptime(args.start_date, '%Y-%m-%d') 143 | end_date = datetime.datetime.strptime(args.end_date, '%Y-%m-%d') 144 | date_list = list(map(lambda x: x.strftime("%Y-%m-%d"), pd.date_range(start_date, end_date).tolist())) 145 | 146 | # RESOLVE QUERY FROM ARG 147 | if len(args.query) > 1: 148 | query_path = find_query(args.query, QUERIES_DIR) 149 | 150 | # If dest_dir doesn't exist, create it. 151 | if not os.path.isdir(dest_dir): 152 | logging.info("Specified destination directory \"{}\" does not exist, creating...".format(dest_dir)) 153 | os.mkdir(dest_dir) 154 | 155 | logger.info( 156 | "\n======\nStart date: {} \nEnd date: {} \nDestination directory: {}\ 157 | \nFilename: {} \nQuery: {}\n======\n".format( 158 | start_date, 159 | end_date, 160 | dest_dir, 161 | filename, 162 | query_path)) 163 | 164 | if query_path is not None: 165 | logger.info("Specified query exists, running...") 166 | query = read_query(query_path) 167 | 168 | if "AB_DIMENSION_VALUE_PREFIX" in query: 169 | try: 170 | query = query.replace( 171 | "AB_DIMENSION_VALUE_PREFIX", args.ab_test_prefix) 172 | except TypeError: 173 | logging.error( 174 | f"Tried to replace AB_DIMENSION_VALUE_PREFIX in query," 175 | f" ab_test_prefix argument is {args.ab_test_prefix}") 176 | sys.exit() 177 | looped_query(query, date_list, [], ProjectID, key_file_path, dest_dir, filename, dialect) 178 | else: 179 | logger.info("Query failed, not enough info provided") 180 | -------------------------------------------------------------------------------- /src/data/preprocess.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | 5 | 6 | def clean_tuple(pe_str_tuple): 7 | """ 8 | TODO: not sure why this is here... 
maybe quotes break things 9 | Transform raw SQL BigQuery string to list of page/event tuples: 10 | :param pe_str_tuple: a tuple, ideally length 2 (page1,eventCategory1<:>page2<>... into a 20 | list of tuples page_event_list = [(page1,eventCategory1<:>iii\.+|>>\.+|\s>>>\s", "", bq_journey_string) 28 | page_event_list = [] 29 | for hit in bq_journey_string.split(">>"): 30 | # Old delimiter: split("//") 31 | page_event_tup = clean_tuple(hit.split("<<")) 32 | # For len==3 Taxon present within bq_journey_string 33 | if len(page_event_tup) == 2 or len(page_event_tup) == 3: 34 | page_event_list.append(tuple(page_event_tup)) 35 | else: 36 | # TODO remove in future 37 | print("Error, tuple split generated too many elements.") 38 | print("Overall BigQuery string:", bq_journey_string) 39 | print("Too long page_event tuple:", page_event_tup) 40 | # Add in dummy variable for debugging and to avoid empty lists 41 | # Useful for inspecting real data, uncomment if desired 42 | # page_event_list.append(("page1","eventCategory<: 0: 62 | position_dict = [(0, page_event_list[0])] 63 | for i, (page, event) in enumerate(page_event_list[1:]): 64 | # print(i) 65 | if page != page_event_list[i][0]: 66 | index = position_dict[-1][0] 67 | position_dict.append((index + 1, (page, event))) 68 | elif page == page_event_list[i][0] and (event != position_dict[-1][1][1]): 69 | position_dict.append((position_dict[-1][0], (page, event))) 70 | return position_dict 71 | return np.NaN 72 | 73 | 74 | def split_event(event_str): 75 | """ 76 | Split eventCategory<: 3: 93 | print("Event tuple has more than two elements:", event_tuple) 94 | print("Original:", event_str) 95 | # event_tuple = (event_tuple[0], "<<".join(event_tuple[1:])) 96 | if len(event_tuple) == 2: 97 | print("Event tuple has only one element:", event_tuple) 98 | print("Original:", event_str) 99 | 100 | 101 | def extract_pe_components(page_event_list, i): 102 | """ 103 | Extract page_list or event_list from page_event_list 104 | :param page_event_list: list of (page,event) tuples 105 | :param i: 0 for page_list 1, for event_list 106 | :return: appropriate hit_list 107 | """ 108 | hit_list = [] 109 | # page_event is a tuple 110 | for page_event in page_event_list: 111 | if i == 0 and page_event[1] == "PAGE<:>A>>B page loops from page_list. Saved as new dataframe column. 
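    e.g. ["/a", "/a", "/b", "/b", "/a"] collapses to ["/a", "/b", "/a"]; only consecutive repeats are removed.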
159 | :param page_list: the list of pages to de-loop 160 | :return: de-loop page list 161 | """ 162 | return [node for i, node in enumerate(page_list) if i == 0 or node != page_list[i - 1]] 163 | 164 | 165 | # Network things, should probably be moved somewhere else 166 | def start_end_page(page_list): 167 | """ 168 | Find start and end pages (nodes) in a list of page hits 169 | :param page_list: list of page hits 170 | :return: start and end nodes 171 | """ 172 | if len(page_list) == 1: 173 | return page_list[0] 174 | else: 175 | return page_list[0], page_list[-1] 176 | 177 | 178 | def subpaths_from_list(page_list): 179 | """ 180 | Build node pairs (edges) from a list of page hits 181 | :param page_list: list of page hits 182 | :return: list of all possible node pairs 183 | """ 184 | return [[page, page_list[i + 1]] for i, page in enumerate(page_list) if i < len(page_list) - 1] 185 | 186 | 187 | def start_page(page_list): 188 | """ 189 | First page/node in a list of page hits 190 | :param page_list: list of page hits 191 | :return: First page 192 | """ 193 | return page_list[0] 194 | 195 | 196 | def end_page(page_list): 197 | """ 198 | Last page/node in a list of page hits 199 | :param page_list: list of page hits 200 | :return: last page 201 | """ 202 | return page_list[-1] 203 | 204 | 205 | def start_end_subpath_list(subpath_list): 206 | """ 207 | First and last page from list of node pairs 208 | :param subpath_list: list of node pairs 209 | :return: first and last page 210 | """ 211 | return subpath_list[0][0], subpath_list[-1][-1] 212 | 213 | 214 | def start_end_edges_subpath_list(subpath_list): 215 | """ 216 | First/last node pairs (edges) from list of node pairs 217 | :param subpath_list: list of node pairs 218 | :return: first and last node pairs 219 | """ 220 | return subpath_list[0], subpath_list[-1] 221 | -------------------------------------------------------------------------------- /notebooks/taxon/taxon_translate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import os\n", 11 | "import numpy as np\n", 12 | "import json\n", 13 | "from ast import literal_eval" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "DATA_DIR = os.getenv(\"DATA_DIR\")\n", 23 | "filename = \"preprocessed_with_dupes_31_10_taxon2.csv.gz\"\n", 24 | "path = os.path.join(DATA_DIR,\"output\", filename)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.read_csv(path,sep=\"\\t\",compression=\"gzip\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df[\"Taxon_List\"] = df[\"Taxon_List\"].map(literal_eval)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def taxon_split(taxon_list):\n", 52 | " return [t for taxon in taxon_list for t in taxon.split(\",\")]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "#### Build list of unique taxons, excluding \"other\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 
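    "# NOTE: needs 'from collections import Counter' (not included in the imports cell above)\n",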
68 | "taxon_counter = Counter()\n", 69 | "for tup in df.itertuples():\n", 70 | " taxons = taxon_split(tup.Taxon_List)\n", 71 | " for taxon in taxons:\n", 72 | " taxon_counter[taxon]+=1\n", 73 | "len(taxon_counter) " 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "#### Map taxon `content_id` to `base_path` using content tagger extract" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "taxon_path = os.path.join(os.getenv(\"DOCUMENTS\"),\"taxons.json.gz\")\n", 90 | "taxon_df = pd.read_json(taxon_path,compression=\"gzip\")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# taxon_path = os.path.join(os.path.dirname(os.getenv(\"DOCUMENTS\")), \"Downloads\", \"2018-11-19 Taxonomy.csv\")\n", 100 | "# taxon_df = pd.read_csv(taxon_path)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "taxon_df" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "taxon_df.shape" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "taxon_df.columns" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Count taxons present in both journeys and taxon export and write to file" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "found = 0\n", 144 | "with open(\"taxon_id_title_311018.tsv\",\"w\") as writer:\n", 145 | " writer.write(\"content_id\\ttitle\\tbase_path\\tparent_content_id\\n\")\n", 146 | " for taxon,value in taxon_counter.items():\n", 147 | " temp = taxon_df[taxon_df.content_id==taxon]\n", 148 | " if temp.shape[0]>0:\n", 149 | " found +=1\n", 150 | "# print(taxon,\",\",temp.iloc[0].title)\n", 151 | " writer.write(\"{}\\t{}\\t{}\\t{}\\n\".format(taxon,\n", 152 | " temp.iloc[0].title,\n", 153 | " temp.iloc[0].base_path,\n", 154 | " temp.iloc[0].parent_content_id))\n", 155 | "found" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "(found*100)/taxon_df.shape[0]\n", 165 | "\n", 166 | "## Translate content_id to level + parents\n", 167 | "\n", 168 | "def recursive_parenting(df,content_id,parent_content_id,parent_list):\n", 169 | " if isinstance(parent_content_id,float) and len(parent_list)==0:\n", 170 | " return []\n", 171 | " elif isinstance(parent_content_id,float):\n", 172 | " return [[thing,i+1]for i,thing in enumerate(reversed(parent_list))]\n", 173 | " else:\n", 174 | " content_id = parent_content_id\n", 175 | " parent_content_id = df[df.content_id==parent_content_id].iloc[0].parent_content_id\n", 176 | " title = df[df.content_id==content_id].iloc[0].title\n", 177 | " parent_list.append([content_id,parent_content_id,title])\n", 178 | " return recursive_parenting(df,content_id,parent_content_id,parent_list)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "column_list = ['content_id','title','level','parents','level1_parent']\n", 188 | 
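    "# One row per taxon: level = len(parents) + 1; level1_parent falls back to the taxon's own title when it has no parents\n",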
"taxon_level_df = pd.DataFrame(columns=column_list)\n", 189 | "missed=0\n", 190 | "for content_id,value in taxon_counter.items():\n", 191 | " if taxon_df[taxon_df.content_id==content_id].shape[0] > 0:\n", 192 | " title = taxon_df[taxon_df.content_id==content_id].iloc[0].title\n", 193 | " parent_list = pd.Series(recursive_parenting(taxon_df,content_id,\n", 194 | " taxon_df[taxon_df.content_id==content_id].parent_content_id.values[0],[]))\n", 195 | " current_level = len(parent_list)+1\n", 196 | " level1_par = title\n", 197 | " if len(parent_list.values) > 0:\n", 198 | " level1_par = parent_list.values[0][0][2]\n", 199 | " taxon_level_df = pd.concat([taxon_level_df,pd.DataFrame([[content_id,\n", 200 | " title,\n", 201 | " current_level,\n", 202 | " parent_list.values,\n", 203 | " level1_par]],columns=column_list)])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "taxon_level_df" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "taxon_level_df.to_csv(\"taxon_level_df.tsv\",sep='\\t',index=False)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Count parent taxons, self-parenting if nan" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "counter =0\n", 238 | "parent_taxons = Counter()\n", 239 | "for taxon,value in taxon_counter.items():\n", 240 | " temp = taxon_df[taxon_df.content_id==taxon]\n", 241 | " if temp.shape[0]>0:\n", 242 | " taxon_base_path = temp.iloc[0].base_path\n", 243 | " parent = None\n", 244 | " if isinstance(temp.iloc[0].parent_content_id,str):\n", 245 | " parent = taxon_df[taxon_df.content_id == temp.iloc[0].parent_content_id].iloc[0].title\n", 246 | " else:\n", 247 | " parent = temp.iloc[0].title\n", 248 | " parent_taxons[parent]+=value" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "list(parent_taxons.most_common(30))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "len(parent_taxons)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.6.0" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /src/analysis/journey_events_analysis.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging.config 3 | import os 4 | from ast import literal_eval 5 | from collections import Counter 6 | 7 | 
import pandas as pd 8 | from scipy import stats 9 | 10 | AGGREGATE_COLUMNS = ['DeviceCategories', 'Event_cats_agg', 'Event_cat_act_agg'] 11 | 12 | NAVIGATE_EVENT_CATS = ['breadcrumbClicked', 13 | 'homeLinkClicked', 14 | '/search', 15 | 'navDocumentCollectionLinkClicked', 16 | 'navAccordionLinkClicked', 17 | 'navLeafLinkClicked', 18 | 'navPolicyAreaLinkClicked', 19 | 'navServicesInformationLinkClicked', 20 | 'navSubtopicContentItemLinkClicked', 21 | 'navSubtopicLinkClicked', 22 | 'navTopicLinkClicked', 23 | 'relatedTaxonomyLinkClicked', 24 | 'stepNavHeaderClicked', 'stepNavLinkClicked', 'stepNavPartOfClicked'] 25 | 26 | # Useful for explicit event category and action matching, may extend in the future 27 | NAVIGATE_EVENT_CATS_ACTS = [('relatedLinkClicked', 'Explore the topic')] 28 | 29 | 30 | def device_count(x, device): 31 | return sum([value for item, value in x if item == device]) 32 | 33 | 34 | def has_related_event(sequence_str): 35 | return all(cond in sequence_str for cond in ["relatedLinkClicked", "Related content"]) 36 | 37 | 38 | def has_nav_event_cat(sequence_str): 39 | return any(event_cat in sequence_str for event_cat in NAVIGATE_EVENT_CATS) 40 | 41 | 42 | def has_nav_event_cat_act(sequence_str): 43 | return any( 44 | event_cat in sequence_str and event_act in sequence_str for event_cat, event_act in NAVIGATE_EVENT_CATS_ACTS) 45 | 46 | 47 | def map_device_counter(df): 48 | """ 49 | Count the device-based occurrences per target device and add as new cols. 50 | 51 | Tablet is ignored as it is assumed to have been filtered. 52 | :param df: 53 | :return: 54 | """ 55 | logging.info("Mapping device counts") 56 | df["DesktopCount"] = df['DeviceCategories'].map(lambda x: device_count(x, "desktop")) 57 | df["MobileCount"] = df['DeviceCategories'].map(lambda x: device_count(x, "mobile")) 58 | 59 | 60 | def chi2_test(vol_desk, vol_mobile, vol_mobile_rel, vol_desk_rel): 61 | vol_mobile_no_rel = vol_mobile - vol_mobile_rel 62 | vol_desk_no_rel = vol_desk - vol_desk_rel 63 | obs = [[vol_mobile_rel, vol_mobile_no_rel], [vol_desk_rel, vol_desk_no_rel]] 64 | return stats.chi2_contingency(obs) 65 | 66 | 67 | def compute_volumes(df, occ_cols): 68 | return (df[occ].sum() for occ in occ_cols) 69 | 70 | 71 | def compute_percents(nums, denoms): 72 | if len(nums) == len(denoms): 73 | return (round((num * 100) / denom, 2) for num, denom in zip(nums, denoms)) 74 | return -1 75 | 76 | 77 | def compute_stats(df, df_filtered, occ_cols): 78 | logger.info("Computing occurrence-based statistics...") 79 | 80 | ind = ["All", "All_related", "Desktop", "Desktop_rel", "Mobile", "Mobile_rel"] 81 | cols = ["Volume", "Percentage", "Shape"] 82 | df_stats = pd.DataFrame(index=ind, columns=cols) 83 | 84 | vol_all, vol_desk, vol_mobile = compute_volumes(df, occ_cols) 85 | vol_all_related, vol_desk_rel, vol_mobile_rel = compute_volumes(df_filtered, occ_cols) 86 | 87 | percent_from_desk, percent_from_mobile = compute_percents([vol_desk, vol_mobile], 2 * [vol_all]) 88 | 89 | percent_related, percent_from_desk_rel, percent_from_mobile_rel = compute_percents( 90 | [vol_all_related, vol_desk_rel, vol_mobile_rel], 91 | [vol_all, vol_desk, vol_mobile]) 92 | 93 | df_stats["Volume"] = [vol_all, vol_all_related, 94 | vol_desk, vol_desk_rel, 95 | vol_mobile, vol_mobile_rel] 96 | df_stats["Percentage"] = [100, percent_related, 97 | percent_from_desk, percent_from_desk_rel, 98 | percent_from_mobile, percent_from_mobile_rel] 99 | 100 | # a, b, c, _ = chi2_test(vol_desk, vol_mobile, vol_mobile_rel, vol_desk_rel) 101 | 102 | return 
df_stats 103 | 104 | 105 | def weight_seq_length(page_lengths, occurrences, name): 106 | length_occ = Counter() 107 | for length, occ in zip(page_lengths, occurrences): 108 | length_occ[length] += occ 109 | data = [] 110 | for key, value in length_occ.items(): 111 | for i in range(value): 112 | data.append(key) 113 | return pd.Series(data, name=name) 114 | 115 | 116 | def list_zipper(df_list, count_cols, names, col_to_describe): 117 | return [[df_all[col_to_describe], df_all[count_col], name] for df_all, count_col, name in 118 | zip(df_list, count_cols, names)] 119 | 120 | 121 | def describe_dfs(df_list_all, df_list_filtered, col_to_describe, count_cols): 122 | """ 123 | 124 | :param df: 125 | :param df_related: 126 | :param col_to_describe: 127 | :return: 128 | """ 129 | 130 | logger.info("Computing statistics for {}".format(col_to_describe)) 131 | descriptive = pd.DataFrame() 132 | names_all = ["All_" + name for name in ["Journeys", "Desktop", "Mobile"]] 133 | names_rel = [name + "_Related" for name in ["Journeys", "Desktop", "Mobile"]] 134 | 135 | to_eval = list_zipper(df_list_all, count_cols, names_all, col_to_describe) + list_zipper(df_list_filtered, 136 | count_cols, 137 | names_rel, col_to_describe) 138 | 139 | for length, occ, name in to_eval: 140 | sr = weight_seq_length(length, occ, name).describe().apply(lambda x: format(x, '.3f')) 141 | descriptive[sr.name] = sr 142 | 143 | return descriptive 144 | 145 | 146 | def column_eval(df): 147 | """ 148 | Change type of specified columns from str to list. Compute Page_List lengths, if missing. 149 | :param df: 150 | :return: void, inplace 151 | """ 152 | logger.info("Literal eval...") 153 | for column in AGGREGATE_COLUMNS: 154 | if column in df.columns and not isinstance(df[column].iloc[0], list): 155 | print("Working on column: {}".format(column)) 156 | df[column] = df[column].map(literal_eval) 157 | if "PageSeq_Length" not in df.columns: 158 | logger.info("Computing PageSeq_Length...") 159 | df['Page_List'] = df['Page_List'].map(literal_eval) 160 | df['PageSeq_Length'] = df['Page_List'].map(len) 161 | 162 | 163 | def initialize(filename, reports_dest): 164 | df = pd.read_csv(filename, sep="\t", compression="gzip") 165 | column_eval(df) 166 | # For dataframe files that include tablet devices 167 | df["TabletCount"] = df['DeviceCategories'].map(lambda x: device_count(x, "tablet")) 168 | df["Occurrences"] = df["Occurrences"] - df["TabletCount"] 169 | 170 | map_device_counter(df) 171 | 172 | df["Has_Related"] = df["Sequence"].map(has_related_event) 173 | 174 | # Journeys per device 175 | desktop_journeys = df[df.DesktopCount > 0] 176 | mobile_journeys = df[df.MobileCount > 0] 177 | 178 | # Related journeys, all/per device 179 | df_related = df[df["Has_Related"]] 180 | desk_rel_journeys = desktop_journeys[desktop_journeys["Has_Related"]] 181 | mobile_rel_journeys = mobile_journeys[mobile_journeys["Has_Related"]] 182 | 183 | occurrence_cols = ["Occurrences", "DesktopCount", "MobileCount"] 184 | 185 | df_stats = compute_stats(df, df_related, occurrence_cols) 186 | df_stats['Shape'] = [df.shape[0], df_related.shape[0], desktop_journeys.shape[0], desk_rel_journeys.shape[0], 187 | mobile_journeys.shape[0], mobile_rel_journeys.shape[0]] 188 | 189 | descriptive_df = describe_dfs([df, desktop_journeys, mobile_journeys], 190 | [df_related, desk_rel_journeys, mobile_rel_journeys], 191 | "PageSeq_Length", occurrence_cols) 192 | 193 | df_stats.to_csv(os.path.join(reports_dest, "device_rel_stats.csv")) 194 | 
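    # PageSeq_Length descriptives are occurrence-weighted: each journey's length is repeated by its occurrence count (see weight_seq_length).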
descriptive_df.to_csv(os.path.join(reports_dest, "PageSeq_Length" + "_describe.csv")) 195 | 196 | 197 | if __name__ == "__main__": 198 | parser = argparse.ArgumentParser(description='Module to run analysis on user journeys in terms of a specific' 199 | 'event(s). For now focusing on \'Related content\' links. Reads' 200 | 'in data from the \'processed_journey\' directory.') 201 | parser.add_argument('input_filename', help='Source user journey file to analyse.') 202 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 203 | args = parser.parse_args() 204 | 205 | DATA_DIR = os.getenv("DATA_DIR") 206 | REPORTS_DIR = os.getenv("REPORTS_DIR") 207 | source_directory = os.path.join(DATA_DIR, "processed_journey") 208 | dest_directory = os.path.join(REPORTS_DIR, args.input_filename) 209 | input_file = os.path.join(source_directory, args.input_filename + ".csv.gz") 210 | 211 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 212 | logging.config.fileConfig(LOGGING_CONFIG) 213 | logger = logging.getLogger('user_journey_event_analysis') 214 | 215 | if args.quiet: 216 | logging.disable(logging.DEBUG) 217 | 218 | if os.path.isfile(input_file): 219 | if not os.path.isdir(dest_directory): 220 | logging.info( 221 | "Specified destination directory \"{}\" does not exist, creating...".format(dest_directory)) 222 | os.mkdir(dest_directory) 223 | initialize(input_file, dest_directory) 224 | else: 225 | logging.info( 226 | "Specified destination directory \"{}\" exists, adding \'v2\' to results...".format(dest_directory)) 227 | -------------------------------------------------------------------------------- /src/data/make_network_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import logging.config 4 | import os 5 | import re 6 | import sys 7 | from ast import literal_eval 8 | from collections import Counter 9 | 10 | import pandas as pd 11 | 12 | src = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 13 | sys.path.append(os.path.join(src, "data")) 14 | import preprocess as prep 15 | 16 | COLUMNS_TO_KEEP = ['Page_List', 'Page_List_NL', 'PageSequence', 'Page_Seq_NL', 'Occurrences', 'Page_Seq_Occurrences', 17 | 'Occurrences_NL'] 18 | NODE_ATTRIBUTES = ['Taxon_Page_List'] 19 | OCCURRENCES = ['Occurrences_NL', 'Page_Seq_Occurrences'] 20 | 21 | 22 | def read_file(filename, columns_to_read, collapse_search=False, use_delooped_journeys=False, 23 | drop_incorrect_occ=False, with_attribute=False): 24 | """ 25 | Read a dataframe compressed csv file, init as dataframe, drop unnecessary columns, prepare target columns 26 | to be evaluated as lists with literal_eval. 
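    Note: a 30% random sample of one-off journeys (Occurrences == 1) is also dropped here to thin the data (see the sampling block below).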
27 | :param with_attribute: 28 | :param use_delooped_journeys: 29 | :param drop_incorrect_occ: 30 | :param filename: processed_journey dataframe 31 | :return: processed for list-eval dataframe 32 | """ 33 | logger.debug("Reading file {}...".format(filename)) 34 | df = pd.read_csv(filename, sep='\t', compression="gzip", skipinitialspace=True, usecols=columns_to_read) 35 | logger.debug("Read in {} columns...".format(df.columns)) 36 | 37 | if drop_incorrect_occ and all(col in df.columns for col in OCCURRENCES): 38 | logger.debug("Dropping incorrect occurrence counts...") 39 | df.drop(['Occurrences_NL', 'Page_Seq_Occurrences'], axis=1, inplace=True) 40 | 41 | print(df.shape) 42 | print(df[df.Occurrences == 1].shape) 43 | # Sample 30% of one-off journeys and then use these indices to drop them 44 | indices = df[df.Occurrences == 1].sample(frac=0.3, random_state=1234).index 45 | print(len(indices)) 46 | df.drop(indices, inplace=True) 47 | print(df.shape) 48 | 49 | logger.debug("Number of rows post one-off occurrence drop: {}".format(df.shape)) 50 | 51 | if with_attribute: 52 | for attribute_column in NODE_ATTRIBUTES: 53 | logger.debug("Working on literal_eval for \"{}\"".format(attribute_column)) 54 | df[attribute_column] = df[attribute_column].map(literal_eval) 55 | 56 | column_to_eval = 'Page_List' 57 | 58 | if use_delooped_journeys: 59 | column_to_eval = 'Page_List_NL' 60 | 61 | if isinstance(df[column_to_eval].iloc[0], str) and any(["," in val for val in df[column_to_eval].values]): 62 | logger.debug("Working on literal_eval for \"{}\"".format(column_to_eval)) 63 | df[column_to_eval] = df[column_to_eval].map(literal_eval) 64 | 65 | if collapse_search: 66 | logger.debug("Collapsing /search nodes in \"{}\"".format(column_to_eval)) 67 | df[column_to_eval] = df[column_to_eval].map(collapse_search_page) 68 | 69 | return df 70 | 71 | 72 | def collapse_search_page(page_list): 73 | return [page for page in page_list if not (re.match(r"^/search[//?|/]", page) or page == "/search")] 74 | 75 | 76 | def compute_occurrences(user_journey_df, page_sequence, occurrences): 77 | logging.debug("Computing specialized occurrences \"{}\" based on \"{}\"...".format(occurrences, page_sequence)) 78 | user_journey_df[occurrences] = user_journey_df.groupby(page_sequence)['Occurrences'].transform( 79 | 'sum') 80 | 81 | 82 | def generate_subpaths(user_journey_df, page_list, subpaths): 83 | """ 84 | Compute lists of subpaths ie node-pairs/edges (where a node is a page) from both original and de-looped page_lists 85 | (page-hit only journeys) 86 | :param subpaths: 87 | :param page_list: 88 | :param user_journey_df: user journey dataframe 89 | :return: inplace assign new columns 90 | """ 91 | logger.debug("Setting up \"{}\" based on \"{}\"...".format(subpaths, page_list)) 92 | user_journey_df[subpaths] = user_journey_df[page_list].map(prep.subpaths_from_list) 93 | 94 | 95 | def edgelist_from_subpaths(user_journey_df, use_delooped_journeys=False): 96 | """ 97 | Generate a counter that represents the edge list. Keys are edges (node pairs) which represent a user going from 98 | first element of pair to second one), values are a sum of journey occurrences (de-looped occurrences since current 99 | computation is based on de-looped subpaths), ie number of times a user/agent went from one page (node) to another. 
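    For example, a journey A >> B >> C with a (de-looped) occurrence count of 2 adds 2 to both the (A, B) and (B, C) edge weights.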
100 | :param use_delooped_journeys: 101 | :param user_journey_df: user journey dataframe 102 | :return: edgelist counter 103 | """ 104 | subpath_default = 'Subpaths' 105 | occurrences_default = 'Page_Seq_Occurrences' 106 | page_list_default = 'Page_List' 107 | page_sequence_default = 'PageSequence' 108 | 109 | if use_delooped_journeys: 110 | logger.debug("Creating edge list from de-looped journeys (based on Subpaths_NL) ...") 111 | subpath_default = 'Subpaths_NL' 112 | occurrences_default = 'Occurrences_NL' 113 | page_list_default = 'Page_List_NL' 114 | page_sequence_default = 'Page_Seq_NL' 115 | 116 | else: 117 | logger.debug("Creating edge list from original journeys (based on Subpaths) ...") 118 | 119 | if occurrences_default not in user_journey_df.columns: 120 | compute_occurrences(user_journey_df, page_sequence_default, occurrences_default) 121 | 122 | logger.debug("Dropping duplicates {}...".format(page_sequence_default)) 123 | user_journey_df.drop_duplicates(page_sequence_default, keep="first", inplace=True) 124 | 125 | generate_subpaths(user_journey_df, page_list_default, subpath_default) 126 | edgelist_counter = Counter() 127 | 128 | ind_path = user_journey_df.columns.get_loc(subpath_default) 129 | ind_occ = user_journey_df.columns.get_loc(occurrences_default) 130 | 131 | for tup in user_journey_df.itertuples(index=False): 132 | for edge in tup[ind_path]: 133 | edgelist_counter[tuple(edge)] += tup[ind_occ] 134 | 135 | return edgelist_counter 136 | 137 | 138 | def compute_node_attribute(user_journey_df): 139 | """ 140 | 141 | :param user_journey_df: 142 | :return: 143 | """ 144 | logger.debug("Identifying node taxons from \"Taxon_Page_List\"...") 145 | node_taxon_dict = {} 146 | for tup in user_journey_df.itertuples(): 147 | for page, taxons in tup.Taxon_Page_List: 148 | if page not in node_taxon_dict.keys(): 149 | node_taxon_dict[page] = taxons 150 | return node_taxon_dict 151 | 152 | 153 | def nodes_from_edgelist(edgelist): 154 | """ 155 | Generate a node list (from edges). Internally represented as a set, returned as alphabetically sorted list 156 | :param edgelist: list of edges (node-pairs) 157 | :return: sorted list of nodes 158 | """ 159 | logger.debug("Creating node list...") 160 | nid = 0 161 | node_list = {} 162 | 163 | for keys, _ in edgelist.items(): 164 | for key in keys: 165 | if key not in node_list.keys(): 166 | node_list[key] = nid 167 | nid += 1 168 | return node_list 169 | 170 | 171 | def compute_nodes_edges(source_filename, dest_filename, cols, collapse_search, use_delooped_journeys, 172 | drop_incorrect_occ, 173 | with_attribute): 174 | """ 175 | Read processed_journey dataframe file, preprocess, compute node/edge lists, write contents of lists to file. 
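    Output is written to <dest_filename>_nodes.csv.gz and <dest_filename>_edges.csv.gz (tab-separated, gzip-compressed).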
176 | :param collapse_search: 177 | :param with_attribute: 178 | :param drop_incorrect_occ: 179 | :param use_delooped_journeys: 180 | :param source_filename: dataframe to be loaded 181 | :param dest_filename: filename prefix for node and edge files 182 | """ 183 | df = read_file(source_filename, cols, collapse_search, use_delooped_journeys, drop_incorrect_occ, with_attribute) 184 | edges = edgelist_from_subpaths(df, use_delooped_journeys) 185 | node_list = nodes_from_edgelist(edges) 186 | 187 | print(list(node_list.items())[0:10]) 188 | 189 | default_edge_header = "Source_node\tSource_id\tDestination_node\tDestination_id\tWeight\n" 190 | default_node_header = "Node\tNode_id\n" 191 | node_attr = None 192 | 193 | if with_attribute: 194 | logger.debug("Creating node-attribute (taxon) dictionary...") 195 | node_attr = compute_node_attribute(df) 196 | default_edge_header = "Source_node\tSource_id\tDestination_node\tDestination_id\tWeight\tSource_Taxon\tDestination_Taxon\n" 197 | default_node_header = "Node\tNode_id\tNode_Taxon\n" 198 | 199 | logger.info("Number of nodes: {} Number of edges: {}".format(len(node_list), len(edges))) 200 | logger.info("Writing edge list to file...") 201 | 202 | edge_writer(dest_filename + "_edges.csv.gz", default_edge_header, edges, node_list, node_attr) 203 | node_writer(dest_filename + "_nodes.csv.gz", default_node_header, node_list, node_attr) 204 | 205 | 206 | def node_writer(filename, header, node_id, node_attr): 207 | with gzip.open(filename, "w") as file: 208 | print(filename) 209 | file.write(header.encode()) 210 | for node, nid in node_id.items(): 211 | file.write("{}\t{}".format(node, nid).encode()) 212 | if node_attr is not None: 213 | file.write("\t{}".format(node_attr[node]).encode()) 214 | file.write("\n".encode()) 215 | 216 | 217 | def edge_writer(filename, header, edges, node_id, node_attr): 218 | with gzip.open(filename, "w") as file: 219 | print(filename) 220 | file.write(header.encode()) 221 | for key, value in edges.items(): 222 | file.write("{}\t{}\t{}\t{}\t{}".format(key[0], node_id[key[0]], key[1], node_id[key[1]], value).encode()) 223 | if node_attr is not None: 224 | file.write("\t{}\t{}".format(node_attr[key[0]], node_attr[key[1]]).encode()) 225 | file.write("\n".encode()) 226 | 227 | 228 | def check_header(filename): 229 | with gzip.open(filename, "rb") as reader: 230 | header = set(reader.readline().decode().replace("\n", "").split("\t")) 231 | return list(header.intersection(set(COLUMNS_TO_KEEP + NODE_ATTRIBUTES))) 232 | 233 | 234 | if __name__ == "__main__": 235 | parser = argparse.ArgumentParser(description='Module that produces node and edge files given a user journey file.') 236 | parser.add_argument('source_directory', default="", nargs="?", help='Source directory for input dataframe file(s).') 237 | parser.add_argument('input_filename', help='Source directory for input dataframe file(s).') 238 | parser.add_argument('dest_directory', default="", nargs="?", 239 | help='Specialized destination directory for output files.') 240 | parser.add_argument('output_filename', help='Naming convention for resulting node and edge files.') 241 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 242 | parser.add_argument('-d', '--delooped', action='store_true', default=False, 243 | help='Use delooped journeys for edge and weight computation') 244 | parser.add_argument('-i', '--incorrect', action='store_true', default=False, 245 | help='Drop incorrect occurrences if necessary') 246 | 
parser.add_argument('-t', '--taxon', action='store_true', default=False, 247 | help='Compute and include additional node attributes (only taxon for now).') 248 | parser.add_argument('-cs', '--collapse_search', action='store_true', default=False, 249 | help='Remove /search? page hits.') 250 | parser.add_argument('-s', '--sampling', action='store_true', default=False, 251 | help='Enable sampling (flag not currently used in this module).') 252 | 253 | args = parser.parse_args() 254 | 255 | DATA_DIR = os.getenv("DATA_DIR") 256 | source_directory = os.path.join(DATA_DIR, 257 | args.source_directory if args.source_directory != "" else "processed_journey") 258 | input_filename = os.path.join(source_directory, ( 259 | args.input_filename + ".csv.gz" if "csv.gz" not in args.input_filename else args.input_filename)) 260 | dest_directory = os.path.join(DATA_DIR, args.dest_directory if args.dest_directory != "" else "processed_network") 261 | 262 | output_filename = os.path.join(dest_directory, args.output_filename) 263 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 264 | logging.config.fileConfig(LOGGING_CONFIG) 265 | logger = logging.getLogger('make_network_data') 266 | 267 | if args.quiet: 268 | logging.disable(logging.DEBUG) 269 | 270 | if os.path.exists(input_filename): 271 | logger.info("Working on file: {}".format(input_filename)) 272 | logger.info("Using de-looped journeys: {}\nDropping incorrect occurrence counts: {}".format(args.delooped, 273 | args.incorrect)) 274 | cols = check_header(input_filename) 275 | compute_nodes_edges(input_filename, output_filename, cols, args.collapse_search, args.delooped, args.incorrect, 276 | args.taxon) 277 | else: 278 | logger.debug("Specified filename does not exist: {}".format(input_filename)) 279 | -------------------------------------------------------------------------------- /src/data/tests/check_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import os\n", 11 | "from collections import Counter" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "DOCUMENTS = os.getenv(\"DOCUMENTS\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "source_dir = os.path.join(DOCUMENTS,\"test1\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "flist = sorted([os.path.join(source_dir,f) for f in os.listdir(source_dir) if \"user_network_paths_meta_\" in f])" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "flist" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "file1 = flist[0]\n", 57 | "file2 = flist[1]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "scrolled": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "df1 = pd.read_csv(os.path.join(source_dir,file1))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "df2 = pd.read_csv(os.path.join(source_dir,file2))" 78 | ] 79 | }, 80 | { 
81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "df1.head(2)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "target = df1.Languages.iloc[0]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "type(target)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "test_out_file = os.path.join(source_dir,\"output/merge_test_1.csv.gz\")\n", 114 | "df3 = pd.read_csv(test_out_file)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df3.head(2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "target2 = df3.Sequence.iloc[0]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "df1[df1.Sequence==target2].iloc[0]" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "Counter([x for x in df1[df1.Sequence==target2].Languages.iloc[0].split(\",\")])" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "df2[df2.Sequence==target2].iloc[0]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "Counter([x for x in df2[df2.Sequence==target2].Languages.iloc[0].split(\",\")])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "df2.Occurrences.sum()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "df3.Occurrences.sum()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "[(df3.columns.get_loc(c),c) for c in df3.columns]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "COUNTABLE_AGGREGATE_COLUMNS = ['Occurrences','Languages', 'Locations', 'DeviceCategories', 'TrafficSources',\n", 205 | " 'TrafficMediums', 'NetworkLocations']" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "def dataframe_splitter(x):\n", 215 | " return [[x.columns.get_loc(\"Sequence\"),x.columns.get_loc(col)]\\\n", 216 | " for col in x.columns if col in COUNTABLE_AGGREGATE_COLUMNS]\n", 217 | "# for col in x.columns:\n", 218 | "# if col in COUNTABLE_AGGREGATE_COLUMNS:\n", 219 | "# print(x.columns.get_loc(col))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "dataframe_splitter(df3)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | 
"source": [ 237 | "df3.columns" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "for i,df in enumerate([df1,df2,df3]):\n", 247 | " for ind in dataframe_splitter(df):\n", 248 | " print(i,df.iloc[:,ind].columns)\n", 249 | " print(df.columns[ind[0]],df.columns[ind[1]])" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "def multi_column_split(list_of_dfs):\n", 259 | " return [(i,df.iloc[0:5,ind]) for i,df in enumerate(list_of_dfs) for ind in dataframe_splitter(df)]\n", 260 | "# for ind in dataframe_splitter(df):\n", 261 | "# to_ret.append(i,df.iloc[:,ind])\n", 262 | "# return to_ret" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "df1[0:5]" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "expected_size = 2\n", 281 | "final_list = [pd.DataFrame()] * expected_size\n", 282 | "print(final_list)\n", 283 | "for i,df in multi_column_split([df1,df2]):\n", 284 | " if len(final_list[i])==0:\n", 285 | " final_list[i] = df\n", 286 | " else:\n", 287 | " final_list[i] = pd.merge(final_list[i],df,how='left',on='Sequence')\n", 288 | "# print(i,\"Occurrences\" in df.columns)\n", 289 | "final_list[0]" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "def receiver(code_df_tup):\n", 299 | " return (code_df_tup)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "left = df3.iloc[0:5,[0,1,2]].drop(0)\n", 309 | "right = df3.iloc[0:5,[1,3]]\n", 310 | "pd.merge(left,right,how='left',on='Sequence')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "list1 = [1,2]\n", 320 | "list2 = [1,2,3,4]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "temp = df3.iloc[:,[1,5]]" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "df3.shape" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "seq_target = df2.Sequence.values" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "df3 = df3.query(\"Sequence.isin(@seq_target)\")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "df3.shape" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "import itertools" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "list_occ = [(0,1),(1,1),(2,1)]\n", 384 | "list_meta = [(0,4),(1,4),(2,4),(0,3),(1,3),(2,3)]" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | 
"execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "list(zip(list_occ,list_meta))" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "list((code,occ,meta) for (code,occ),(code1,meta) in itertools.product(list_occ,list_meta) if code==code1)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "list(itertools.product(list_occ,list_meta))" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "\"Occurrences\" in df3.columns[[1,2,3]]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "[1] * 2" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "any([False])" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "test_out_file2 = os.path.join(source_dir,\"output/merge_test_sliced_13days.csv.gz\")\n", 448 | "df3 = pd.read_csv(test_out_file2,compression=\"gzip\")" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "df3.head()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "from ast import literal_eval" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "cols_to_eval = ['Languages','Locations','DeviceCategories','TrafficSources']\n", 476 | "for col in cols_to_eval:\n", 477 | " print(col)\n", 478 | " df3[col] = df3[col].map(literal_eval)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "len(df3.sort_values(\"Occurrences\",ascending=False).iloc[0].Locations)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "df3.head()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "df3.drop(cols_to_eval,axis=1)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "test_oct = \"../../data/output\"" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "os.listdir(test_oct)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "df4 = pd.read_csv(os.path.join(test_oct,\"merged_oct_15_17.csv.gz\"),compression=\"gzip\")" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "df4[df4.PageSeq_Length==1].sort_values(\"Occurrences\",ascending=False).head().Sequence.values" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | 
"execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "df4[df4.Sequence.str.contains(\"%26&licenceid=\")].Event_List.iloc[0]" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "### object size stuff" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "import sys" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "df3.shape" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "sys.getsizeof(temp)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "sys.getsizeof(df3)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "2,147,483,647" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "105,849,367" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "475,231,861" 623 | ] 624 | } 625 | ], 626 | "metadata": { 627 | "kernelspec": { 628 | "display_name": "Python 3", 629 | "language": "python", 630 | "name": "python3" 631 | }, 632 | "language_info": { 633 | "codemirror_mode": { 634 | "name": "ipython", 635 | "version": 3 636 | }, 637 | "file_extension": ".py", 638 | "mimetype": "text/x-python", 639 | "name": "python", 640 | "nbconvert_exporter": "python", 641 | "pygments_lexer": "ipython3", 642 | "version": "3.6.0" 643 | } 644 | }, 645 | "nbformat": 4, 646 | "nbformat_minor": 2 647 | } 648 | -------------------------------------------------------------------------------- /src/data/merge_dataset.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import logging.config 5 | import os 6 | import sys 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pandas import DataFrame 12 | 13 | src = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 14 | sys.path.append(os.path.join(src, "data")) 15 | sys.path.append(os.path.join(src, "features")) 16 | import preprocess as prep 17 | import build_features as feat 18 | 19 | COUNTABLE_AGGREGATE_COLUMNS = ['Languages', 'Locations', 'DeviceCategories', 'TrafficSources', 20 | 'TrafficMediums', 'NetworkLocations', 'Dates'] 21 | # Execute module for only one file 22 | SINGLE: bool = False 23 | # Fewer files to process than available cpus. 24 | FEWER_THAN_CPU: bool = False 25 | # Drop journeys occurring once (not in a day, multiple days, governed by DEPTH globals). If false, overrides depth 26 | # globals and keeps journeys, resulting in massive dataframes (danger zone). 
27 | DROP_ONE_OFFS: bool = False 28 | # Drop journeys of length 1 29 | DROP_ONES: bool = False 30 | # Keep only journeys of length 1 31 | KEEP_ONES: bool = False 32 | 33 | 34 | def list_to_dict(metadata_list): 35 | """ 36 | Transform a metadata list into a dictionary (Counter) aggregate 37 | :param metadata_list: list of metadata values 38 | :return: Counter of value frequencies 39 | """ 40 | return Counter([xs for xs in metadata_list]) 41 | 42 | 43 | def str_to_dict(metadata_str): 44 | """ 45 | Transform a comma-separated metadata string eg "mobile,desktop,mobile" into a dict-like Counter of 46 | frequencies eg {mobile: 2, desktop: 1}. 47 | :param metadata_str: comma-separated metadata string 48 | :return: dict-like Counter of frequencies 49 | """ 50 | return list_to_dict(metadata_str.split(',')) 51 | 52 | 53 | def sequence_preprocess(user_journey_df): 54 | """ 55 | Bulk-execute main input pre-processing functions: from BigQuery journey strings to Page_Event_List to Page_List. 56 | PageSequence required for dataframes groupbys/filtering. 57 | :param user_journey_df: dataframe 58 | :return: no return, columns added in place. 59 | """ 60 | logger.info("BQ Sequence string to Page_Event_List...") 61 | user_journey_df['Page_Event_List'] = user_journey_df['Sequence'].map(prep.bq_journey_to_pe_list) 62 | logger.info("Page_Event_List to Page_List...") 63 | user_journey_df['Page_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_pe_components(x, 0)) 64 | logger.info("Page_List to PageSequence...") 65 | # TODO: Remove condition + internal PageSequence post-testing/debugging. 66 | if 'PageSequence' not in user_journey_df.columns: 67 | user_journey_df['PageSequence'] = user_journey_df['Page_List'].map(lambda x: ">>".join(x)) 68 | else: 69 | user_journey_df['PageSequence_internal'] = user_journey_df['Page_List'].map(lambda x: ">>".join(x)) 70 | 71 | 72 | def event_preprocess(user_journey_df): 73 | """ 74 | Bulk-execute event-related functions. Run after sequence_preprocess(user_journey_df) so that 75 | the Page_Event_List column exists. 76 | :param user_journey_df: dataframe 77 | :return: no return, columns added in place. 78 | """ 79 | logger.info("Preprocess and aggregate events...") 80 | logger.debug("Page_Event_List to Event_List...") 81 | user_journey_df['Event_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_pe_components(x, 1)) 82 | logger.debug("Computing event-related counts and frequencies...") 83 | event_counters(user_journey_df) 84 | 85 | 86 | def taxon_preprocess(user_journey_df): 87 | """ 88 | Bulk map functions for taxon extraction: taxon lists and page-taxon pairs. 89 | :param user_journey_df: dataframe 90 | :return: no return, columns added in place. 91 | """ 92 | logger.info("Preprocess taxons...") 93 | logger.debug("Page_Event_List to Taxon_List...") 94 | user_journey_df['Taxon_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_cd_components(x, 2)) 95 | logger.debug("Page_Event_List to Taxon_Page_List...") 96 | user_journey_df['Taxon_Page_List'] = user_journey_df['Page_Event_List'].map(lambda x: prep.extract_pcd_list(x, 2)) 97 | 98 | 99 | def event_counters(user_journey_df): 100 | """ 101 | Bulk map functions for event frequency/counts. 102 | :param user_journey_df: dataframe 103 | :return: no return, columns added in place.
104 | """ 105 | # logger.debug("Computing number of event categories...") 106 | # user_journey_df['num_event_cats'] = user_journey_df['Event_List'].map(feat.count_event_cat) 107 | logger.debug("Computing frequency of event categories...") 108 | user_journey_df['Event_cats_agg'] = user_journey_df['Event_List'].map(feat.aggregate_event_cat) 109 | logger.debug("Computing frequency of event categories and actions...") 110 | user_journey_df['Event_cat_act_agg'] = user_journey_df['Event_List'].map(feat.aggregate_event_cat_act) 111 | 112 | 113 | def add_loop_columns(user_journey_df): 114 | """ 115 | Add de-looped journey columns: Page_List_NL, Page_Seq_NL and Occurrences_NL. 116 | :param user_journey_df: dataframe 117 | :return: no return, columns added in place. 118 | """ 119 | logger.info("Preprocess journey looping...") 120 | logger.debug("Collapsing loops...") 121 | user_journey_df['Page_List_NL'] = user_journey_df['Page_List'].map(prep.collapse_loop) 122 | # In order to groupby during analysis step 123 | logger.debug("De-looped lists to string...") 124 | user_journey_df['Page_Seq_NL'] = user_journey_df['Page_List_NL'].map(lambda x: ">>".join(x)) 125 | 126 | if 'Page_Seq_Occurrences' not in user_journey_df.columns: 127 | logger.debug("Setting up Page_Seq_Occurrences...") 128 | user_journey_df['Page_Seq_Occurrences'] = user_journey_df.groupby('PageSequence')['Occurrences'].transform( 129 | 'sum') 130 | 131 | # Count occurrences of de-looped journeys, most generic journey frequency metric. 132 | logger.debug("Aggregating de-looped journey occurrences...") 133 | user_journey_df['Occurrences_NL'] = user_journey_df.groupby('Page_Seq_NL')['Occurrences'].transform('sum') 134 | logger.debug("De-looped page sequence to list...") 135 | user_journey_df['Page_List_NL'] = user_journey_df['Page_Seq_NL'].map( 136 | lambda x: x.split(">>") if isinstance(x, str) else np.NaN) 137 | 138 | 139 | def agg_dict(agg_from_dict, row_dict): 140 | for xs, value in row_dict.items(): 141 | if xs in agg_from_dict.keys(): 142 | agg_from_dict[xs] += value 143 | else: 144 | agg_from_dict[xs] = value 145 | return agg_from_dict 146 | 147 | 148 | def aggregate_metadata(dataframe): 149 | metadata_counter = {} 150 | for agg in dataframe.columns: 151 | if agg in COUNTABLE_AGGREGATE_COLUMNS: 152 | logging.info("Setting up aggregate dictionary {}".format(agg)) 153 | metadata_counter[agg] = {} 154 | 155 | logging.info("Starting iteration...") 156 | for agg in metadata_counter.keys(): 157 | logging.info("Aggregating: {}".format(agg)) 158 | for row in zip(dataframe['Sequence'], dataframe[agg]): 159 | if row[0] in metadata_counter[agg].keys(): 160 | metadata_counter[agg][row[0]] = agg_dict(metadata_counter[agg][row[0]], 161 | str_to_dict(row[1])) 162 | else: 163 | metadata_counter[agg][row[0]] = str_to_dict(row[1]) 164 | 165 | return metadata_counter 166 | 167 | 168 | def preprocess_dataframe(dataframe): 169 | """ 170 | Aggregate metadata and occurrence counts per unique Sequence. Metadata from duplicated sequences 171 | (multiple merged files) is combined into per-sequence frequency lists. 172 | :param dataframe: merged dataframe built from raw BigQuery extract(s) 173 | :return: preprocessed dataframe 174 | """ 175 | logging.info("Dataframe shape: {}".format(dataframe.shape)) 176 | 177 | multiple = any(dataframe.Sequence.duplicated()) 178 | 179 | if multiple: 180 | logging.info("Working on multiple merged dataframes") 181 | metadata_counter = aggregate_metadata(dataframe) 182 | else: 183 | logging.info("Working on a single dataframe") 184 | for agg in dataframe.columns: 185 | 186 | if agg in COUNTABLE_AGGREGATE_COLUMNS: 187 | logging.info("Agg {}".format(agg)) 188 | dataframe[agg] = dataframe[agg].map(lambda x: list(str_to_dict(x).items())) 189 | 190 | logging.info("Computing sequence
occurrences...") 191 | dataframe['Occurrences'] = dataframe.groupby('Sequence')['Occurrences'].transform('sum') 192 | 193 | if multiple: 194 | bef = dataframe.shape[0] 195 | logger.debug("Current # of rows: {}. Dropping duplicate rows...".format(bef)) 196 | dataframe.drop_duplicates(subset='Sequence', keep='first', inplace=True) 197 | after = dataframe.shape[0] 198 | logger.debug("Dropped {} duplicated rows.".format(bef - after)) 199 | 200 | for agg in metadata_counter.keys(): 201 | logger.info("Mapping {}, items: {}...".format(agg, len(metadata_counter[agg]))) 202 | dataframe[agg] = dataframe['Sequence'].map(lambda x: list(metadata_counter[agg][x].items())) 203 | 204 | if DROP_ONE_OFFS: 205 | dataframe['Page_Seq_Occurrences'] = dataframe.groupby('PageSequence')['Occurrences'].transform('sum') 206 | bef = dataframe.shape[0] 207 | dataframe = dataframe[dataframe.Page_Seq_Occurrences > 1] 208 | after = dataframe.shape[0] 209 | logger.debug("Dropped {} one-off rows.".format(bef - after)) 210 | # Return the dataframe so that the one-off filtering above is not lost by the caller. 211 | return dataframe 212 | def initialize_make(files: list, destination: str, merged_filename: str): 213 | """ 214 | Read, concatenate, preprocess and save the merged dataframe. 215 | :param files: list of raw BigQuery extract files to merge 216 | :param destination: destination directory for the merged dataframe file 217 | :param merged_filename: filename for the resulting merged dataframe file 218 | :return: no return, writes the merged dataframe to file. 219 | """ 220 | 221 | logging.info("Reading {} files...".format(len(files))) 222 | 223 | df = pd.concat([read_file(file) for file in files], ignore_index=True) 224 | 225 | df = preprocess_dataframe(df) 226 | 227 | logging.debug(df.iloc[0]) 228 | 229 | path_to_file = os.path.join(destination, "merged_" + merged_filename) 230 | 231 | 232 | logging.debug("Saving merged dataframe...") 233 | logger.info("Saving at: {}".format(path_to_file)) 234 | df.to_csv(path_to_file, sep="\t", compression='gzip', index=False) 235 | 236 | 237 | def read_file(filename): 238 | """ 239 | Initialize dataframe using specified filename, do some initial prep if necessary depending on global vars 240 | (specified via arguments) 241 | :param filename: filename to read, no exists_check because files are loaded from a specified directory 242 | :return: loaded (maybe modified) pandas dataframe 243 | """ 244 | logging.info("Reading: {}".format(filename)) 245 | df: DataFrame = pd.read_csv(filename, compression="gzip") 246 | # logging.info("pre {}".format(df.shape)) 247 | df.dropna(subset=['Sequence'], inplace=True) 248 | # logging.info("post {}".format(df.shape)) 249 | # print(df.shape) 250 | 251 | # Drop journeys of length 1 252 | if DROP_ONES: 253 | logging.debug("Dropping ones...") 254 | df.query("PageSeq_Length > 1", inplace=True) 255 | 256 | # Keep ONLY journeys of length 1 257 | elif KEEP_ONES: 258 | logging.debug("Keeping only ones...") 259 | df.query("PageSeq_Length == 1", inplace=True) 260 | # If one-off journeys will be dropped later, PageSequence is needed to group their occurrences 261 | if DROP_ONE_OFFS: 262 | if "PageSequence" not in df.columns: 263 | sequence_preprocess(df) 264 | # df.drop(DROPABLE_COLS, axis=1, inplace=True) 265 | return df 266 | 267 | 268 | def generate_file_list(source_dir, stub): 269 | """ 270 | Initialize list of files to read from a specified directory. If stub is not empty, filter files to be read 271 | based on whether their filename includes the stub.
272 | :param source_dir: Source directory 273 | :param stub: Filename stub for file filtering 274 | :return: a list of files 275 | """ 276 | file_list = sorted([os.path.join(source_dir, file) for file in os.listdir(source_dir)]) 277 | if stub is not None: 278 | return [file for file in file_list if stub in file] 279 | else: 280 | return file_list 281 | 282 | 283 | def build_filename(file_list): 284 | """ 285 | 286 | :param file_list: 287 | :return: 288 | """ 289 | file_name = "_".join(file_list[0].split("/")[-1].split("_")[0:-1]) 290 | date_list = ["".join(file.split("_")[-1].replace(".csv.gz", "").split("-")) for file in 291 | [file_list[0], file_list[-1]]] 292 | if DROP_ONES: 293 | file_name += ("_dlo") 294 | if KEEP_ONES: 295 | file_name += ("_klo") 296 | if DROP_ONE_OFFS: 297 | file_name += ("_doo") 298 | 299 | return file_name + "_" + "_".join(date_list) 300 | 301 | 302 | if __name__ == "__main__": 303 | parser = argparse.ArgumentParser(description='Module that produces a merged, metadata-aggregated and ' 304 | 'preprocessed dataset (.csv.gz), given a source directory ' 305 | 'containing raw BigQuery extract dataset(s). Merging is ' 306 | 'skipped if only one file is provided.') 307 | parser.add_argument('output_filename', default="", nargs="?", 308 | help='Naming convention for resulting merged dataframe file.') 309 | parser.add_argument('source_directory', default="", nargs="?", help='Source directory for input dataframe file(s).') 310 | parser.add_argument('dest_directory', default="", nargs="?", 311 | help='Specialized destination directory for output dataframe file.') 312 | parser.add_argument('-doo', '--drop_one_offs', action='store_true', 313 | help='Drop journeys occurring only once (on a daily basis, ' 314 | 'or over approximately 3 day periods).') 315 | parser.add_argument('-kloo', '--keep_len_one_only', action='store_true', 316 | help='Keep ONLY journeys with length 1 ie journeys visiting only one page.') 317 | parser.add_argument('-dlo', '--drop_len_one', action='store_true', 318 | help='Drop journeys with length 1 ie journeys visiting only one page.') 319 | parser.add_argument('-f', '--filename_stub', default=None, type=str, 320 | help='Filter files to be loaded based on whether their filenames contain specified stub.') 321 | parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Turn off debugging logging.') 322 | args = parser.parse_args() 323 | 324 | DATA_DIR = os.getenv("DATA_DIR") 325 | source_directory = os.path.join(DATA_DIR, 326 | args.source_directory if args.source_directory != "" else "raw_bq_extract") 327 | dest_directory = os.path.join(DATA_DIR, args.dest_directory if args.dest_directory != "" else "processed_journey") 328 | # final_filename = args.output_filename 329 | filename_stub = args.filename_stub 330 | 331 | LOGGING_CONFIG = os.getenv("LOGGING_CONFIG") 332 | logging.config.fileConfig(LOGGING_CONFIG) 333 | logger = logging.getLogger('merge_dataset') 334 | 335 | if args.quiet: 336 | logging.disable(logging.DEBUG) 337 | 338 | if os.path.isdir(source_directory): 339 | # Set up variable values from parsed arguments 340 | DROP_ONE_OFFS = args.drop_one_offs 341 | DROP_ONES = args.drop_len_one 342 | KEEP_ONES = args.keep_len_one_only 343 | logger.info( 344 | "Data exclusion parameters:\nDrop one-off journeys: {}" 345 | "\nDrop journeys of length 1: {}" 346 | "\nKeep journeys only of length 1: {}".format(DROP_ONE_OFFS, DROP_ONES, KEEP_ONES)) 347 | 348 | logger.info("Loading data...") 349 | 350 | to_load = 
generate_file_list(source_directory, filename_stub) 351 | 352 | if len(to_load) > 0: 353 | 354 | if not os.path.isdir(dest_directory): 355 | logging.info( 356 | "Specified destination directory \"{}\" does not exist, creating...".format(dest_directory)) 357 | os.mkdir(dest_directory) 358 | 359 | final_filename = build_filename(to_load) 360 | logger.debug("Produced output filename: {}".format(final_filename)) 361 | initialize_make(to_load, dest_directory, final_filename + ".csv.gz") 362 | else: 363 | logging.info( 364 | "Specified source directory \"{}\" contains no target files.".format(source_directory)) 365 | 366 | else: 367 | logging.info("Specified source directory \"{}\" does not exist, cannot read files.".format(source_directory)) 368 | -------------------------------------------------------------------------------- /notebooks/taxon/taxon_eda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "import os\n", 11 | "src_data = os.path.join(os.path.dirname(os.getenv(\"DATA_DIR\")),\"src/data\")\n", 12 | "sys.path.append(src_data)\n", 13 | "import preprocess as prep\n", 14 | "import datetime\n", 15 | "import colorsys\n", 16 | "import pandas as pd\n", 17 | "import re\n", 18 | "import numpy as np\n", 19 | "from ast import literal_eval\n", 20 | "from collections import Counter\n", 21 | "import pprint\n", 22 | "import networkx as nx\n", 23 | "import pygraphviz\n", 24 | "from networkx.drawing.nx_agraph import graphviz_layout\n", 25 | "%matplotlib inline\n", 26 | "import matplotlib.pyplot as plt" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "DATA_DIR = os.getenv(\"DATA_DIR\")\n", 36 | "filename = \"preprocessed_with_dupes_31_10_taxon2.csv.gz\"\n", 37 | "path = os.path.join(DATA_DIR,\"output\", filename)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "df = pd.read_csv(path,sep=\"\\t\",compression=\"gzip\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df.columns" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "any(df.Sequence.duplicated())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "for col in df.columns:\n", 83 | "# if \"Sequence\" not in col and not col.startswith(\"Event\"):\n", 84 | "# if isinstance(df[col].iloc[0],str) and \"[\" in df[col].iloc[0]:\n", 85 | "# print(col)\n", 86 | "# df[col] = df[col].map(literal_eval)\n", 87 | " if re.search(\"^Taxon|^Page\",col):\n", 88 | " if isinstance(df[col].iloc[0],str) and \"[\" in df[col].iloc[0]:\n", 89 | " print(col)\n", 90 | " df[col] = df[col].map(literal_eval)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Count taxons within journeys\n", 98 | "### Setup" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 
106 | "source": [ 107 | "def unique_taxon_flat_unique(taxon_list):\n", 108 | " return sum(Counter(set([t for taxon in taxon_list for t in taxon.split(\",\")])).values())\n", 109 | "def unique_taxon_nested_unique(taxon_list):\n", 110 | " return sum(Counter(set([taxon for taxon in taxon_list])).values())\n", 111 | "def unique_taxon_flat_pages(taxon_list):\n", 112 | " return sum(Counter([t for taxon in taxon_list for t in taxon.split(\",\")]).values())\n", 113 | "def unique_taxon_nested_pages(taxon_list):\n", 114 | " return sum(Counter([taxon for taxon in taxon_list]).values())" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df.iloc[0].Sequence" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "target = df.Taxon_List.iloc[1]\n", 133 | "print(target)\n", 134 | "print(unique_taxon_flat_unique(target))\n", 135 | "print(unique_taxon_nested_unique(target))\n", 136 | "print(unique_taxon_flat_pages(target))\n", 137 | "print(unique_taxon_nested_pages(target))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "df['taxon_flat_unique'] = df['Taxon_List'].map(unique_taxon_flat_unique)\n", 147 | "df['taxon_nested_unique'] = df['Taxon_List'].map(unique_taxon_nested_unique)\n", 148 | "df['taxon_flat_pages'] = df['Taxon_List'].map(unique_taxon_flat_pages)\n", 149 | "df['taxon_nested_pages'] = df['Taxon_List'].map(unique_taxon_nested_pages)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "df.describe().drop(\"count\").applymap(lambda x: format(x,\"f\"))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "df.describe().drop(\"count\").applymap(lambda x: '%.2f' % x)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "df[df.taxon_flat_unique == 429].Taxon_List.values" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "df[df.taxon_flat_unique == 0].Sequence.values" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "def taxon_split(taxon_list):\n", 195 | " return [t for taxon in taxon_list for t in taxon.split(\",\")]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "#### Build list of unique taxons, excluding \"other\"\n", 205 | "taxon_counter = Counter()\n", 206 | "for tup in df.itertuples():\n", 207 | " taxons = taxon_split(tup.Taxon_List)\n", 208 | " for taxon in taxons:\n", 209 | " taxon_counter[taxon]+=1\n", 210 | "len(taxon_counter) " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "list(taxon_counter.keys())[0:10]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "taxon_counter.most_common(10)" 229 | ] 230 | }, 
231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "taxon_df = pd.read_csv(\"taxon_level_df.tsv\",sep='\\t')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Assign unique parent taxons per journey" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df['subpaths'] = df['Page_List'].map(prep.subpaths_from_list)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "for val in df[['Page_List','subpaths']].iloc[0].values:\n", 263 | " pprint.pprint(val)\n", 264 | " print(\"\\n====\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### create new subpaths where each element is a (page,parent taxon pair, pick one?)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "def get_taxon_name(taxon_id):\n", 281 | " if taxon_id in taxon_df.content_id.values:\n", 282 | " return taxon_df[taxon_df.content_id==taxon_id].iloc[0].title\n", 283 | " else:\n", 284 | " return None" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "def taxon_title(taxon_id_list):\n", 294 | " return [get_taxon_name(taxon_id) for taxon_id in taxon_id_list]" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "def subpaths_from_pcd_list(pcd_list):\n", 304 | " return [[(page,taxon_title(taxons)), (pcd_list[i + 1][0],taxon_title(pcd_list[i + 1][1]))] \n", 305 | " for i, (page,taxons) in enumerate(pcd_list) if i < len(pcd_list) - 1]" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "test_journey = df[df.PageSeq_Length>4].iloc[0]" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "pprint.pprint([p for p,_ in test_journey.Taxon_Page_List])" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "for i,element in enumerate(subpaths_from_pcd_list(test_journey.Taxon_Page_List)):\n", 333 | " print(i,element,\"\\n====\")" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "df['taxon_subpaths'] = df['Taxon_Page_List'].map(subpaths_from_pcd_list)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# taxon_title(df.Taxon_Page_List.iloc[0][0][1])\n", 352 | "\n", 353 | "# def add_to_taxon_dict(diction,taxon_list):\n", 354 | "# for taxon in taxon_list:\n", 355 | "# if taxon not in diction.keys():\n", 356 | "# diction[taxon] = get_taxon_name(taxon)\n", 357 | "\n", 358 | "# df.Taxon_Page_List.iloc[0][0][1]\n", 359 | "\n", 360 | "# df.Taxon_Page_List.iloc[0][1][1]\n", 361 | "\n", 362 | "# taxon_name = {}\n", 363 | "# 
add_to_taxon_dict(taxon_name,df.Taxon_Page_List.iloc[0][0][1]+df.Taxon_Page_List.iloc[0][1][1])\n", 364 | "\n", 365 | "# taxon_name\n", 366 | "\n", 367 | "# df.shape\n", 368 | "\n", 369 | "# print(datetime.datetime.now().strftime(\"[%H:%M:%S]\"))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Graph viz" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "## graph some stuff based on taxon (parent?)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "def add_page_taxon(diction,key,value):\n", 393 | " if key not in diction.keys():\n", 394 | " diction[key] = value" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "adjacency_list = {}\n", 404 | "adjacency_counter = Counter()\n", 405 | "freq_filter = 1000\n", 406 | "dupe_count = 0\n", 407 | "page_taxon_title = {}\n", 408 | "\n", 409 | "for i,tup in enumerate(df.sort_values(by=\"Occurrences\",ascending=False).itertuples()):\n", 410 | "# for page,taxon in tup.Taxon_Page_List:\n", 411 | " for subpath in subpaths_from_pcd_list(tup.Taxon_Page_List):\n", 412 | " start = subpath[0][0]\n", 413 | " end = subpath[1][0]\n", 414 | "# print(subpath[0][1]+subpath[1][1])\n", 415 | " adjacency_counter [(start,end)] += tup.Occurrences\n", 416 | " \n", 417 | " \n", 418 | " if start!=end and adjacency_counter[(start,end)] >= freq_filter:\n", 419 | " \n", 420 | " add_page_taxon(page_taxon_title,start,subpath[0][1])\n", 421 | " add_page_taxon(page_taxon_title,end,subpath[1][1])\n", 422 | " \n", 423 | "\n", 424 | " if start in adjacency_list.keys():\n", 425 | " if end not in adjacency_list[start]:\n", 426 | " adjacency_list[start].append(end)\n", 427 | " else:\n", 428 | " adjacency_list[start] = [end]\n", 429 | " \n", 430 | " if len(adjacency_list)>1000:\n", 431 | " break\n", 432 | " \n", 433 | " if i%30000==0:\n", 434 | " print(datetime.datetime.now().strftime(\"[%H:%M:%S]\"),\"ind\",i)\n", 435 | " print(len(adjacency_list))" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "len(adjacency_list)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "list(adjacency_list.items())[0:10]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "list(page_taxon_title.items())[0:10]" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "for page,taxons in page_taxon_title.items():\n", 472 | " page_taxon_title[page] = \"_\".join([taxon if taxon is not None else \"None\" for taxon in taxons]) " 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "### Set up colors" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "N = len(page_taxon_title.values())\n", 489 | "HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]\n", 490 | "RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)\n", 491 | "RGB_tuples = list(RGB_tuples)" 492 | ] 493 
| }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "taxon_color = {taxon:RGB_tuples[i] for i,taxon in enumerate(page_taxon_title.values())}" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "digraph = nx.DiGraph()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "for node,out_nodes in adjacency_list.items():\n", 519 | " color = taxon_color[page_taxon_title[node]]\n", 520 | " digraph.add_node(node,taxon=page_taxon_title[node],color=color)\n", 521 | " for o_node in out_nodes:\n", 522 | " color = taxon_color[page_taxon_title[o_node]]\n", 523 | " digraph.add_node(o_node,taxon=page_taxon_title[o_node],color=color)\n", 524 | " digraph.add_edge(node,o_node)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "digraph.edges()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "edges = digraph.edges()\n", 543 | "color_map = [data['color'] for _,data in digraph.nodes(data=True)]\n", 544 | "pos = nx.nx_agraph.graphviz_layout(digraph, prog='neato')\n", 545 | "nx.draw(digraph, pos, node_size=20, fontsize=12, edges=edges, node_color=color_map)\n", 546 | "plt.show()" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [] 555 | } 556 | ], 557 | "metadata": { 558 | "kernelspec": { 559 | "display_name": "Python 3", 560 | "language": "python", 561 | "name": "python3" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.6.0" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 2 578 | } 579 | -------------------------------------------------------------------------------- /notebooks/eda/look_at_sampling_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 38, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-02-04T16:02:59.714543Z", 9 | "start_time": "2019-02-04T16:02:59.709887Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import os \n", 15 | "import pandas as pd\n", 16 | "import numpy as np\n", 17 | "import ast\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import math\n", 20 | "\n", 21 | "from collections import Counter" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 39, 27 | "metadata": { 28 | "ExecuteTime": { 29 | "end_time": "2019-02-04T16:02:59.969283Z", 30 | "start_time": "2019-02-04T16:02:59.956071Z" 31 | } 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "%matplotlib inline" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 40, 41 | "metadata": { 42 | "ExecuteTime": { 43 | "end_time": "2019-02-04T16:03:00.309544Z", 44 | "start_time": "2019-02-04T16:03:00.301949Z" 45 | } 46 | }, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "500\n" 53 | ] 54 | 
} 55 | ], 56 | "source": [ 57 | "# Some of the columns we will look at can be quite wide, but it's good to get an idea of what they contain\n", 58 | "print(pd.get_option('max_colwidth'))\n", 59 | "pd.set_option('max_colwidth',500)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## File/dir locations\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "ExecuteTime": { 74 | "end_time": "2019-02-04T14:49:55.813774Z", 75 | "start_time": "2019-02-04T14:49:55.809824Z" 76 | } 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "DATA_DIR = os.getenv(\"DATA_DIR\")\n", 81 | "filename = \"preprocessed_taxon_pageseq_20190114_20190116.csv.gz\"\n", 82 | "# df_file = os.path.join(DATA_DIR, \"processed_journey\", filename)\n", 83 | "# df_reduced_file = os.path.join(DATA_DIR, \"processed_journey\", \"reduced_\"+filename)\n", 84 | "# df_rel_file = os.path.join(DATA_DIR, \"processed_journey\", \"rel_\"+filename)\n", 85 | "# df_doo_file = os.path.join(\n", 86 | "# DATA_DIR, \"processed_journey\",\n", 87 | "# \"doo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz\")\n", 88 | "\n", 89 | "df_dlo_file = os.path.join(\n", 90 | " DATA_DIR, \"processed_journey\",\n", 91 | " \"dlo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz\")\n", 92 | "df_kloo_file = os.path.join(\n", 93 | " DATA_DIR, \"processed_journey\",\n", 94 | " \"kloo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz\")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": { 101 | "ExecuteTime": { 102 | "end_time": "2019-02-04T14:58:13.251383Z", 103 | "start_time": "2019-02-04T14:49:56.256851Z" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "#the 'drop length one' data read into pandas dataframe\n", 109 | "dlo = pd.read_csv(df_dlo_file, compression='gzip')\n", 110 | "#the 'keep length one only' data read into pandas dataframe\n", 111 | "kloo = pd.read_csv(df_kloo_file, compression='gzip')" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2019-02-04T14:58:13.428292Z", 120 | "start_time": "2019-02-04T14:58:13.339211Z" 121 | } 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "(3788851, 15)" 128 | ] 129 | }, 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "dlo.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": { 143 | "ExecuteTime": { 144 | "end_time": "2019-02-04T14:58:13.446260Z", 145 | "start_time": "2019-02-04T14:58:13.439735Z" 146 | } 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "(890977, 15)" 153 | ] 154 | }, 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "kloo.shape" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Load up a data from two files: dlo = drop length one journeys, kloo = keep length one journeys only \n", 169 | "\n", 170 | "This data was produced by an early version of the pipeline and is missing some descriptive variables, such as taxons etc. However, it contains the sequences of pages and behaviours (or events) of users on those pages, including interaction with the sidebar and the related links contained therein." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 18, 176 | "metadata": { 177 | "ExecuteTime": { 178 | "end_time": "2019-02-04T15:23:15.685611Z", 179 | "start_time": "2019-02-04T15:23:15.477848Z" 180 | } 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "6537680\n", 188 | "7650687\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "print(dlo['Occurrences'].sum())\n", 194 | "print(kloo['Occurrences'].sum())" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 62, 200 | "metadata": { 201 | "ExecuteTime": { 202 | "end_time": "2019-02-04T16:53:09.376433Z", 203 | "start_time": "2019-02-04T16:53:07.241082Z" 204 | }, 205 | "code_folding": [] 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "#get a reproducible sample of 20% of journey types from each dataframe, \n", 210 | "#sampled in proportion to the number of occurrences of each journey type\n", 211 | "#then join the new samples together into a single dataframe\n", 212 | "\n", 213 | "# df = pd.concat([dlo.sample(frac=0.2, random_state=1234, weights=dlo.Occurrences).copy(), kloo.sample(frac=0.2, random_state=1234, weights=kloo.Occurrences).copy()], ignore_index=True)\n", 214 | "\n", 215 | "\n", 216 | "# try sampling with replacement, using occurrences as weights, but then \n", 217 | "# change all \"occurrences\" to 1, to try to create a more representative sample?\n", 218 | "# df = pd.concat([\n", 219 | "# dlo.sample(\n", 220 | "# frac=0.4, random_state=1234, weights=dlo.Occurrences, replace=True\n", 221 | "# ).copy(),\n", 222 | "# kloo.sample(\n", 223 | "# frac=0.4, random_state=1234, weights=kloo.Occurrences, replace=True\n", 224 | "# ).copy()],\n", 225 | "# ignore_index=True)\n", 226 | "\n", 227 | "# try concatting and THEN sampling with replacement, using occurrences as\n", 228 | "# weights, but then change all \"occurrences\" to 1, to try to create a more \n", 229 | "# representative sample?\n", 230 | "# df = pd.concat([\n", 231 | "# dlo.copy(),\n", 232 | "# kloo.copy()],\n", 233 | "# ignore_index=True)\n", 234 | "# df = df.sample(\n", 235 | "# frac=0.4, random_state=1234, weights=df.Occurrences, replace=True\n", 236 | "# )\n", 237 | "\n", 238 | "# # try concatting and THEN sampling without replacement, using occurrences as\n", 239 | "# # weights\n", 240 | "# df = pd.concat([\n", 241 | "# dlo.copy(),\n", 242 | "# kloo.copy()],\n", 243 | "# ignore_index=True)\n", 244 | "# df = df.sample(\n", 245 | "# frac=0.4, random_state=1234, weights=df.Occurrences\n", 246 | "# )\n", 247 | "\n", 248 | "# try sampling with, using occurrences as weights, \n", 249 | "# and sum(Occurrences)*0.4 as n, but then change all \"occurrences\" to 1, \n", 250 | "# to try to create a more representative sample?\n", 251 | "df = pd.concat([\n", 252 | " dlo.sample(\n", 253 | " n=math.ceil(0.4*dlo['Occurrences'].sum()), random_state=1234, \n", 254 | " weights=dlo.Occurrences, replace=True\n", 255 | " ).copy(),\n", 256 | " kloo.sample(\n", 257 | " n=math.ceil(0.4*kloo['Occurrences'].sum()), random_state=1234,\n", 258 | " weights=kloo.Occurrences, replace=True\n", 259 | " ).copy()],\n", 260 | " ignore_index=True)\n", 261 | "\n", 262 | "\n", 263 | "# try just concatting them\n", 264 | "df = pd.concat([\n", 265 | " dlo[\n", 266 | " ['DeviceCategories', 'Occurrences', 'Sequence', 'Event_cat_act_agg']\n", 267 | " ].copy(),\n", 268 | " kloo[\n", 269 | " ['DeviceCategories', 'Occurrences', 'Sequence', 'Event_cat_act_agg']\n", 270 | " ].copy()],\n", 271 | " 
ignore_index=True)\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 63, 277 | "metadata": { 278 | "ExecuteTime": { 279 | "end_time": "2019-02-04T16:53:09.386798Z", 280 | "start_time": "2019-02-04T16:53:09.380347Z" 281 | } 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "(4679828, 4)" 288 | ] 289 | }, 290 | "execution_count": 63, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "df.shape" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Remove tablet occurrences" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 64, 309 | "metadata": { 310 | "ExecuteTime": { 311 | "end_time": "2019-02-04T16:54:22.456736Z", 312 | "start_time": "2019-02-04T16:53:09.388872Z" 313 | } 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "def device_count(x, device):\n", 318 | " return sum([value for item, value in x if item == device])\n", 319 | "df[\"TabletCount\"] = df['DeviceCategories'].apply(\n", 320 | " ast.literal_eval).map(lambda x: device_count(x, \"tablet\"))\n", 321 | "df[\"Occurrences\"] = df[\"Occurrences\"] - df[\"TabletCount\"]" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 65, 327 | "metadata": { 328 | "ExecuteTime": { 329 | "end_time": "2019-02-04T16:54:23.745408Z", 330 | "start_time": "2019-02-04T16:54:22.459505Z" 331 | } 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "(4294728, 5)" 338 | ] 339 | }, 340 | "execution_count": 65, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "df = df[df[\"Occurrences\"] != 0]\n", 347 | "df.shape" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 61, 353 | "metadata": { 354 | "ExecuteTime": { 355 | "end_time": "2019-02-04T16:52:44.670535Z", 356 | "start_time": "2019-02-04T16:52:44.617370Z" 357 | } 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "# MAKE EACH OCCURRENCES 1\n", 362 | "# df['Occurrences'] = 1" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## journey_click_rate\n", 370 | "There is no difference in the proportion of journeys using at least one related link (journey_click_rate) between page variant A and page variant B.\n", 371 | "\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "\\begin{equation*}\n", 379 | "\\frac{\\text{total number of journeys including at least one click on a related link}}{\\text{total number of journeys}}\n", 380 | "\\end{equation*}" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "### total number of journeys including at least one click on a related link\n", 388 | "The numerator.\n", 389 | "\n", 390 | "We need to check within the Sequence column, whether the corresponding user journey has an Event where a related link was clicked. There is more than one level to this Event, we are specifically interested in \"Related content\" (as this is the sidebar of the page, the related links we are interested in)." 
391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 66, 396 | "metadata": { 397 | "ExecuteTime": { 398 | "end_time": "2019-02-04T16:54:48.924293Z", 399 | "start_time": "2019-02-04T16:54:48.917549Z" 400 | }, 401 | "code_folding": [] 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "#Compute whether a journey includes at least one related link click\n", 406 | "def is_related(x):\n", 407 | " return all(cond in x for cond in [\"relatedLinkClicked\",\"Related content\"])" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Please note, `is_related` does not make sure that `relatedLinkClicked` and `Related content` exist in the same event in `Sequence`" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 67, 420 | "metadata": { 421 | "ExecuteTime": { 422 | "end_time": "2019-02-04T16:55:03.066834Z", 423 | "start_time": "2019-02-04T16:54:49.296795Z" 424 | } 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "# map across the Sequence variable, which includes pages and Events\n", 429 | "# we want to pass all the list elements to a function one-by-one and then collect the output.\n", 430 | "df[\"Has_Related\"] = df[\"Sequence\"].map(is_related)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 68, 436 | "metadata": { 437 | "ExecuteTime": { 438 | "end_time": "2019-02-04T16:55:03.188149Z", 439 | "start_time": "2019-02-04T16:55:03.069423Z" 440 | } 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "395771" 447 | ] 448 | }, 449 | "execution_count": 68, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "# We can filter for True and sum\n", 456 | "df[df[\"Has_Related\"]].Occurrences.sum()" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### total number of journeys\n", 464 | "The denominator." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 69, 470 | "metadata": { 471 | "ExecuteTime": { 472 | "end_time": "2019-02-04T16:55:03.222965Z", 473 | "start_time": "2019-02-04T16:55:03.191101Z" 474 | } 475 | }, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "12971165" 481 | ] 482 | }, 483 | "execution_count": 69, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "df.Occurrences.sum()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "### final metric" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "Given this sample, we see:" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 70, 509 | "metadata": { 510 | "ExecuteTime": { 511 | "end_time": "2019-02-04T16:55:03.340889Z", 512 | "start_time": "2019-02-04T16:55:03.233213Z" 513 | } 514 | }, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "0.030511600153108838" 520 | ] 521 | }, 522 | "execution_count": 70, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "df[df[\"Has_Related\"]].Occurrences.sum() / df.Occurrences.sum()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "## ratio of clicks on navigation elements vs. 
clicks on related links\n", 536 | "\n", 537 | "There is no statistically significant difference in the ratio of clicks on navigation elements vs. clicks on related links between page variant A and page variant B" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "\\begin{equation*}\n", 545 | "\\frac{\\text{total number of navigation element click events from content pages}}{\\text{total number of related link click events}}\n", 546 | "\\end{equation*}" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "### total number of related link click events" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "we need to check `Related content` is in the event, because the `relatedLinkClicked` is also used for \"explore the topic\" links at the bottom of the page, with the event action containing `Explore the topic`, e.g. `(('relatedLinkClicked', '2.1 Explore the topic'), 1)`" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 51, 566 | "metadata": { 567 | "ExecuteTime": { 568 | "end_time": "2019-02-04T16:30:35.229409Z", 569 | "start_time": "2019-02-04T16:30:35.204678Z" 570 | } 571 | }, 572 | "outputs": [], 573 | "source": [ 574 | "# If the event category is 'relatedLinkClicked' and the event action contains 'Related content', \n", 575 | "# return the count of that event\n", 576 | "def get_number_of_events_rl(event):\n", 577 | " if event[0][0] == 'relatedLinkClicked' and 'Related content' in event[0][1]:\n", 578 | " return event[1]\n", 579 | " return 0\n", 580 | "\n", 581 | "def sum_related_click_events(event_list):\n", 582 | " return sum([get_number_of_events_rl(event) for event in event_list])" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 52, 588 | "metadata": { 589 | "ExecuteTime": { 590 | "end_time": "2019-02-04T16:34:03.277220Z", 591 | "start_time": "2019-02-04T16:30:35.762510Z" 592 | } 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "# get the number of related links clicks per Sequence\n", 597 | "df['Related Links Clicks per seq'] = df['Event_cat_act_agg'].apply(\n", 598 | " ast.literal_eval).map(sum_related_click_events)\n", 599 | "\n", 600 | "# get the total number of related links clicks for that row (clicks per sequence multiplied by occurrences)\n", 601 | "df['Related Links Clicks row total'] = df['Related Links Clicks per seq'] * df['Occurrences']" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 53, 607 | "metadata": { 608 | "ExecuteTime": { 609 | "end_time": "2019-02-04T16:34:03.326684Z", 610 | "start_time": "2019-02-04T16:34:03.282394Z" 611 | } 612 | }, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": [ 617 | "205595" 618 | ] 619 | }, 620 | "execution_count": 53, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "df['Related Links Clicks row total'].sum()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [] 635 | } 636 | ], 637 | "metadata": { 638 | "kernelspec": { 639 | "display_name": "Python 3", 640 | "language": "python", 641 | "name": "python3" 642 | }, 643 | "language_info": { 644 | "codemirror_mode": { 645 | "name": "ipython", 646 | "version": 3 647 | }, 648 | "file_extension": ".py", 649 | "mimetype": "text/x-python", 650 | "name": "python", 651 | "nbconvert_exporter": "python", 
652 | "pygments_lexer": "ipython3", 653 | "version": "3.6.0" 654 | }, 655 | "toc": { 656 | "base_numbering": 1, 657 | "nav_menu": {}, 658 | "number_sections": true, 659 | "sideBar": true, 660 | "skip_h1_title": false, 661 | "title_cell": "Table of Contents", 662 | "title_sidebar": "Contents", 663 | "toc_cell": false, 664 | "toc_position": { 665 | "height": "507px", 666 | "left": "62px", 667 | "top": "154px", 668 | "width": "165px" 669 | }, 670 | "toc_section_display": true, 671 | "toc_window_display": true 672 | } 673 | }, 674 | "nbformat": 4, 675 | "nbformat_minor": 2 676 | } 677 | --------------------------------------------------------------------------------
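The notebook above flags that `is_related` only runs two independent substring checks against the `Sequence` string, so a journey can be counted even when `relatedLinkClicked` and `Related content` come from different events. A minimal event-level sketch of a stricter check is below; it assumes the `((category, action), count)` structure that `Event_cat_act_agg` already uses, but the toy dataframe and the `related_click_count` helper are illustrative stand-ins, not part of the repository.

```python
import ast

import pandas as pd

# Hypothetical toy frame standing in for the sampled journey data.
# Event_cat_act_agg is a string repr of ((category, action), count) tuples,
# matching the column parsed with ast.literal_eval in the notebook.
toy = pd.DataFrame({
    "Occurrences": [3, 2, 5],
    "Event_cat_act_agg": [
        "[(('relatedLinkClicked', '1.1 Related content'), 2)]",
        "[(('relatedLinkClicked', '2.1 Explore the topic'), 1)]",
        "[(('navClicked', '1 Sidebar'), 1)]",
    ],
})


def related_click_count(event_list):
    """Count clicks where the category is 'relatedLinkClicked' AND the action
    mentions 'Related content' -- both conditions applied to the same event,
    unlike the two independent substring checks on Sequence."""
    return sum(
        count
        for (category, action), count in event_list
        if category == "relatedLinkClicked" and "Related content" in action
    )


events = toy["Event_cat_act_agg"].apply(ast.literal_eval)
toy["Has_Related"] = events.map(related_click_count) > 0

# journey_click_rate: journeys with at least one related-link click divided by
# all journeys, weighting each journey type by its Occurrences.
rate = toy.loc[toy["Has_Related"], "Occurrences"].sum() / toy["Occurrences"].sum()
print(rate)  # 0.3 for this toy data: only the first row counts
```

Because the count is taken per event, the same helper also yields the per-sequence click totals that the notebook's `sum_related_click_events` produces for the navigation-vs-related-links ratio, so the event-level and substring-level numerators can be compared directly on the real data.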