├── assets
│   ├── img
│   │   ├── .gitkeep
│   │   └── favicon.ico
│   ├── js
│   │   ├── script.js
│   │   ├── plugins.js
│   │   └── main.js
│   └── css
│       └── style.css
├── bioc_webstats
│   ├── webpack
│   │   └── .gitkeep
│   ├── __init__.py
│   ├── templates
│   │   ├── nav.html
│   │   ├── footer.html
│   │   ├── 500.html
│   │   ├── 404.html
│   │   ├── 401.html
│   │   ├── about.html
│   │   ├── main.css
│   │   ├── home.html
│   │   ├── layout.html
│   │   ├── category.html
│   │   └── stats-bioc.html
│   ├── flask_ingest.sh
│   ├── flask_ingest_crontab_setup.sh
│   ├── app_waitress.py
│   ├── extensions.py
│   ├── static
│   │   ├── cache_manifest.json
│   │   └── barchart.js
│   ├── splash.py
│   ├── database.py
│   ├── ingest_logs.py
│   ├── configmodule.py
│   ├── packages_table_update.py
│   ├── aws_functions.py
│   ├── commands.py
│   ├── app.py
│   └── stats.py
├── tests
│   ├── __init__.py
│   ├── .env
│   ├── factories.py
│   ├── test_models.py
│   └── conftest.py
├── migrations
│   ├── README
│   ├── script.py.mako
│   ├── alembic.ini
│   ├── versions
│   │   └── 9c266b1a4aa9_.py
│   └── env.py
├── .coveragerc
├── e2e
│   ├── constants.ts
│   ├── .env
│   ├── workflow_page.spec.ts
│   └── smoke_test.spec.ts
├── test-deployment
│   ├── ansible
│   │   ├── inventory.ini
│   │   ├── ansible.cfg
│   │   └── install-packages.yml
│   ├── terraform.tfstate.backup
│   ├── terra-development
│   │   ├── main.tf
│   │   └── variables.tf
│   ├── terra-production
│   │   ├── main.tf
│   │   └── variables.tf
│   └── main.tf
├── etl
│   ├── i_populate_web_downloads.sql
│   ├── t_bio _webstats_info.sql
│   ├── postgresql
│   │   ├── u_webstats_info_psql.sql
│   │   ├── t_webstats_info_psql.sql
│   │   ├── t_categorystats_postgres.sql
│   │   ├── t_stats_postgres.sql
│   │   ├── t_packages_postgres.sql
│   │   ├── update_stats.sql
│   │   ├── v_categorystats_postgres.sql
│   │   ├── v_stats_psqlsql.sql
│   │   ├── t_bioc_web_downloads_psql.sql
│   │   ├── f_stats.sql
│   │   └── sp_update_stats.sql
│   ├── change-table-location.json
│   ├── cli_specimens.sh
│   ├── t_stats_parquet.sql
│   ├── t_stats_tsv.sql
│   ├── t_bioc_web_downloads.sql
│   ├── i_populate_category_stats.sql
│   ├── legacy-access_log-to-web_downloads.sql
│   ├── bioc-www-logreader-prod.json
│   ├── legacy-table-def.sql
│   ├── retrieve_package_info.py
│   ├── t_cloudfront_logs.sql
│   ├── s3_move_objects.sh
│   ├── v_bioc_web_downloads.sql
│   ├── t_bioc_web_logs_partitioned.sql
│   └── log_partition_projector.py
├── docs
│   ├── webstats-erd-0_1_9.png
│   ├── design_brief_block_diagram.png
│   ├── bioc-webstats-architecture-v2.excalidraw.png
│   ├── test-instance-setup.md
│   ├── old_specimen.txt
│   ├── webstats-system-overview.md
│   └── stats_replacement_design_brief.md
├── conversion
│   ├── README.md
│   ├── isql_row_counts.sh
│   ├── global_package_history.R
│   ├── global_package_history.py
│   └── access_logs_to_stats_tsv.py
├── dist
│   ├── bioc_webstats-0.1.10-py3-none-any.whl
│   ├── bioc_webstats-0.1.8-py3-none-any.whl
│   └── bioc_webstats-0.1.9-py3-none-any.whl
├── autoapp.py
├── installer_scripts
│   ├── build_docker.sh
│   ├── logrotate.d
│   │   ├── bioc-webstats
│   │   └── README.txt
│   ├── entrypoint.sh
│   ├── bioc-webstats.service
│   ├── apache2.service
│   ├── flask_environment
│   ├── run_docker.sh
│   ├── aws_installer.sh
│   ├── create_ec2_instance.sh
│   ├── Dockerfile
│   ├── docker_dev_setup_example.md
│   └── installer.sh
├── .env.example
├── test-instance-config.py
├── .github
│   └── workflows
│       └── playwright.yml
├── .gitignore
├── pyproject.toml
├── playwright.config.ts
├── package.json
├── webpack.config.js
└── README.md

/assets/img/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bioc_webstats/webpack/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
-------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /assets/js/script.js: -------------------------------------------------------------------------------- 1 | // App initialization code goes here 2 | -------------------------------------------------------------------------------- /migrations/README: -------------------------------------------------------------------------------- 1 | Single-database configuration for Flask. 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = bioc_webstats 3 | omit = 4 | tests/* 5 | 6 | -------------------------------------------------------------------------------- /bioc_webstats/__init__.py: -------------------------------------------------------------------------------- 1 | """Main application package.""" 2 | __version__ = "0.1.8" 3 | -------------------------------------------------------------------------------- /e2e/constants.ts: -------------------------------------------------------------------------------- 1 | // constants.ts 2 | export const URL_STEM = '/packages/stats/'; 3 | -------------------------------------------------------------------------------- /e2e/.env: -------------------------------------------------------------------------------- 1 | # WEBSTATS_URL='http://localhost:5000' 2 | # WEBSTATS_URL='http://3.217.171.126:8000/' -------------------------------------------------------------------------------- /test-deployment/ansible/inventory.ini: -------------------------------------------------------------------------------- 1 | [ec2] 2 | ec2-3-142-133-254.us-east-2.compute.amazonaws.com -------------------------------------------------------------------------------- /test-deployment/ansible/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | inventory = inventory.ini 3 | remote_user = ubuntu -------------------------------------------------------------------------------- /assets/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/assets/img/favicon.ico -------------------------------------------------------------------------------- /etl/i_populate_web_downloads.sql: -------------------------------------------------------------------------------- 1 | insert into bioc_web_downloads 2 | select * 3 | from v_bioc_web_downloads; -------------------------------------------------------------------------------- /assets/js/plugins.js: -------------------------------------------------------------------------------- 1 | // place any jQuery/helper plugins in here, instead of separate, slower script files. 
2 |
--------------------------------------------------------------------------------
/docs/webstats-erd-0_1_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/docs/webstats-erd-0_1_9.png
--------------------------------------------------------------------------------
/docs/design_brief_block_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/docs/design_brief_block_diagram.png
--------------------------------------------------------------------------------
/etl/t_bio _webstats_info.sql:
--------------------------------------------------------------------------------
1 | -- TODO operationalize
2 | CREATE TABLE webstats_info
3 | ("key" varchar(23) PRIMARY KEY,
4 | "value" varchar(128));
--------------------------------------------------------------------------------
/conversion/README.md:
--------------------------------------------------------------------------------
1 | This folder contains artifacts for the conversion from the legacy stats.bioconductor.org server to the bioc-webstats system.
2 |
--------------------------------------------------------------------------------
/dist/bioc_webstats-0.1.10-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/dist/bioc_webstats-0.1.10-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/bioc_webstats-0.1.8-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/dist/bioc_webstats-0.1.8-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/bioc_webstats-0.1.9-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/dist/bioc_webstats-0.1.9-py3-none-any.whl
--------------------------------------------------------------------------------
/autoapp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Create an application instance."""
3 | from bioc_webstats.app import create_app
4 | app = create_app("development")
--------------------------------------------------------------------------------
/docs/bioc-webstats-architecture-v2.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/docs/bioc-webstats-architecture-v2.excalidraw.png
--------------------------------------------------------------------------------
/bioc_webstats/templates/nav.html:
--------------------------------------------------------------------------------
1 |
2 |
7 |
8 |
--------------------------------------------------------------------------------
/installer_scripts/build_docker.sh:
--------------------------------------------------------------------------------
1 | # To override execution user:group webstats:webstats, add --build-arg OSUSER="yankee" --build-arg OSGROUP="doddle"
2 | docker build -t webstats-server .
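# A hedged example of the override described in the comment above (the
# user/group names here are illustrative, not values used by this repo):
#   docker build -t webstats-server --build-arg OSUSER="deploy" --build-arg OSGROUP="deploy" .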
3 |
--------------------------------------------------------------------------------
/bioc_webstats/flask_ingest.sh:
--------------------------------------------------------------------------------
1 | # TODO what is the correct directory for this script?
2 | . .venv/bin/activate
3 | export FLASK_APP="bioc_webstats.app:create_app('production', '/bioc/webstats/prod')"
4 | flask ingest -c E1TVLJONPTUXV3
5 |
--------------------------------------------------------------------------------
/installer_scripts/logrotate.d/bioc-webstats:
--------------------------------------------------------------------------------
1 | /var/log/bioc-webstats/webstats.log {
2 |     weekly
3 |     rotate 4
4 |     compress
5 |     missingok
6 |     notifempty
7 |     create 0640 ubuntu ubuntu
8 |     delaycompress
9 | }
10 |
--------------------------------------------------------------------------------
/conversion/isql_row_counts.sh:
--------------------------------------------------------------------------------
1 | # report row counts for all sqlite3 databases
2 | for file in /mnt/data/home/biocadmin/download_dbs/download_db_*.sqlite; do
3 |   echo $(basename "$file") $(sqlite3 "$file" "select count(*) from access_log")
4 | done
5 |
--------------------------------------------------------------------------------
/etl/postgresql/u_webstats_info_psql.sql:
--------------------------------------------------------------------------------
1 | -- update "Valid through" date
2 |
3 | INSERT INTO webstats_info (key, value)
4 | VALUES ('ValidThru', (SELECT MAX(date) FROM bioc_web_downloads))
5 | ON CONFLICT (key)
6 | DO UPDATE SET value = EXCLUDED.value;
7 |
--------------------------------------------------------------------------------
/test-deployment/terraform.tfstate.backup:
--------------------------------------------------------------------------------
1 | {
2 |   "version": 4,
3 |   "terraform_version": "1.5.5",
4 |   "serial": 5,
5 |   "lineage": "ed68b15c-df94-c839-bf44-3a56b77689b5",
6 |   "outputs": {},
7 |   "resources": [],
8 |   "check_results": null
9 | }
10 |
--------------------------------------------------------------------------------
/etl/change-table-location.json:
--------------------------------------------------------------------------------
1 | {
2 |   "DatabaseName": "glue-sup-db",
3 |   "TableInput": {
4 |     "Name": "bw-dev_bioc_weblogs_small_test",
5 |     "StorageDescriptor": {
6 |       "Location": "s3://dev-bioc-weblogs-small-test/weblogs/"
7 |     }
8 |   }
9 | }
10 |
--------------------------------------------------------------------------------
/bioc_webstats/templates/footer.html:
--------------------------------------------------------------------------------
1 |
13 |
--------------------------------------------------------------------------------
/installer_scripts/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Start systemd as the init system. exec replaces this shell, so it must be
5 | # the last statement of the script; services such as ssh should be enabled
6 | # as systemd units so that systemd starts them itself.
7 | exec /lib/systemd/systemd
--------------------------------------------------------------------------------
/tests/.env:
--------------------------------------------------------------------------------
1 | # Environment variable overrides for local development
2 | FLASK_APP=autoapp.py
3 | FLASK_DEBUG=1
4 | FLASK_ENV=development
5 | DATABASE_URL=sqlite:///dev.db
6 | LOG_LEVEL=INFO
7 | SECRET_KEY=012983901238102381038012381298
8 | # In production, set to a higher number, like 31556926
9 | SEND_FILE_MAX_AGE_DEFAULT=0
10 |
--------------------------------------------------------------------------------
/test-deployment/terra-development/main.tf:
--------------------------------------------------------------------------------
1 | # development/main.tf
2 | provider "aws" {
3 |   region = var.aws_region
4 | }
5 |
6 | # ...
7 |
8 | resource "aws_instance" "example" {
9 |   ami           = var.ami_id
10 |   instance_type = var.instance_type
11 |
12 |   tags = {
13 |     Name = "development-web-server"
14 |   }
15 | }
--------------------------------------------------------------------------------
/test-deployment/terra-production/main.tf:
--------------------------------------------------------------------------------
1 | # production/main.tf
2 | provider "aws" {
3 |   region = var.aws_region
4 | }
5 |
6 | # ...
7 |
8 | resource "aws_instance" "example" {
9 |   ami           = var.ami_id
10 |   instance_type = var.instance_type
11 |
12 |   tags = {
13 |     Name = "production-web-server"
14 |   }
15 | }
--------------------------------------------------------------------------------
/bioc_webstats/flask_ingest_crontab_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # TODO correct directory?
4 |
5 | # Define the script path
6 | SCRIPT_PATH="flask_ingest.sh"
7 |
8 | # Ensure the script is executable
9 | chmod +x $SCRIPT_PATH
10 |
11 | # Add a new cron job
12 | (crontab -l 2>/dev/null; echo "12 1 * * * $SCRIPT_PATH") | crontab -
13 |
--------------------------------------------------------------------------------
/installer_scripts/bioc-webstats.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=Waitress service for www-webstats
3 | After=network.target
4 |
5 | [Service]
6 | WorkingDirectory=/home/ubuntu
7 | EnvironmentFile=/home/ubuntu/flask_environment
8 | ExecStart=/home/ubuntu/.venv/bin/python3 -m bioc_webstats.app_waitress
9 |
10 | [Install]
11 | WantedBy=multi-user.target
12 |
--------------------------------------------------------------------------------
/test-deployment/terra-development/variables.tf:
--------------------------------------------------------------------------------
1 | # development/variables.tf
2 | variable "aws_region" {
3 |   type    = string
4 |   default = "us-east-2"
5 | }
6 |
7 | variable "ami_id" {
8 |   type    = string
9 |   default = "ami-09d9029d9fc5e5238"
10 | }
11 |
12 | variable "instance_type" {
13 |   type    = string
14 |   default = "t2.micro"
15 | }
16 |
--------------------------------------------------------------------------------
/test-deployment/terra-production/variables.tf:
--------------------------------------------------------------------------------
1 | # production/variables.tf
2 | variable "aws_region" {
3 |   type    = string
4 |   default = "us-east-2"
5 | }
6 |
7 | variable "ami_id" {
8 |   type    = string
9 |   default = "ami-09d9029d9fc5e5238"
10 | }
11 |
12 | variable "instance_type" {
13 |   type    = string
14 |   default = "t2.micro"
15 | }
16 |
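# Hedged usage sketch (directory names are from this repo; terraform init and
# terraform apply are the standard commands, no repo-specific flags assumed):
#   cd test-deployment/terra-development && terraform init && terraform apply
#   cd test-deployment/terra-production  && terraform init && terraform apply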
--------------------------------------------------------------------------------
/installer_scripts/logrotate.d/README.txt:
--------------------------------------------------------------------------------
1 | Specimen log rotation configuration for the web app.
2 |
3 | Place the file installer_scripts/logrotate.d/bioc-webstats
4 | on the target system under /etc/logrotate.d/
5 |
6 | Then set the owner to root:
7 | sudo chown root:root bioc-webstats
8 |
9 | Test by running:
10 | sudo logrotate -d /etc/logrotate.d/bioc-webstats
--------------------------------------------------------------------------------
/bioc_webstats/app_waitress.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from waitress import serve
4 | from bioc_webstats import app
5 | import logging
6 | import sys
7 |
8 | logging.basicConfig(stream=sys.stderr)
9 |
10 | # TODO parameterize the port (currently hard-coded to 8000 below)
11 | if __name__ == "__main__":
12 |     serve(app.create_app('production', '/bioc/webstats/prod'), host='0.0.0.0', port=8000)
--------------------------------------------------------------------------------
/etl/cli_specimens.sh:
--------------------------------------------------------------------------------
1 | aws glue get-table --database-name glue-sup-db --name "bw-dev_bioc_weblogs_small_test"
2 |
3 | aws glue update-table --cli-input-json file://change-table-location.json
4 |
5 | aws glue create-table --database-name glue-sup-db --table-input file://etl/glue_weblog_table_in.json
6 |
7 | aws glue delete-table --database-name glue-sup-db --name bioc_web_logs
8 |
--------------------------------------------------------------------------------
/installer_scripts/apache2.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=The Apache HTTP Server
3 | After=network.target remote-fs.target nss-lookup.target
4 |
5 | [Service]
6 | ExecStart=/usr/sbin/apachectl -D FOREGROUND
7 | ExecReload=/usr/sbin/apachectl graceful
8 | ExecStop=/usr/sbin/apachectl stop
9 | Type=notify
10 | PrivateTmp=true
11 |
12 | [Install]
13 | WantedBy=multi-user.target
--------------------------------------------------------------------------------
/test-deployment/main.tf:
--------------------------------------------------------------------------------
1 | provider "aws" {
2 |   region = "us-east-2" # Change this to your desired AWS region
3 | }
4 |
5 | resource "aws_instance" "example" {
6 |   ami           = "ami-09d9029d9fc5e5238" # Change this to the desired AMI ID
7 |   instance_type = "t2.micro"              # Change this to the desired instance type
8 |
9 |   tags = {
10 |     Name = "test-bio-web-stats"
11 |   }
12 | }
--------------------------------------------------------------------------------
/installer_scripts/flask_environment:
--------------------------------------------------------------------------------
1 | export FLASK_APP="bioc_webstats.app:create_app('production', '/bioc/webstats/prod')"
2 | export FLASK_AWS_PATH_PARAMETER=/bioc/webstats/prod
3 | export FLASK_OSUSER="webstats"
4 | export FLASK_OSGROUP="webstats"
5 | export FLASK_APPROOT="/var/www/bioc-webstats"
6 | export FLASK_LOGROOT="/var/log/bioc-webstats"
--------------------------------------------------------------------------------
/bioc_webstats/templates/500.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 | {% block page_title %}Server error{% endblock %}
5 |
6 | {% block content %}
7 | <div>
8 |   <div>
9 |     <h1>500</h1>
10 |     <p>Sorry, something went wrong on our system. Don't panic, we are fixing it! Please try again later.</p>
11 |   </div>
12 | </div>
13 | {% endblock %} 14 | 15 | -------------------------------------------------------------------------------- /bioc_webstats/templates/404.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends "layout.html" %} 3 | 4 | {% block page_title %}Page Not Found{% endblock %} 5 | 6 | {% block content %} 7 |
<div>
8 |   <div>
9 |     <h1>404</h1>
10 |     <p>Sorry, that page doesn't exist.</p>
11 |     <p>Want to go home instead?</p>
12 |   </div>
13 | </div>
14 | {% endblock %} 15 | 16 | -------------------------------------------------------------------------------- /installer_scripts/run_docker.sh: -------------------------------------------------------------------------------- 1 | # Start the server 2 | docker run --privileged --name=webstats1 -d webstats-server 3 | # Retrieve the local ip address 4 | docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' webstats1 5 | # Move the most recent whl file to the docker container 6 | docker cp "$(ls -1ar ../dist/bioc_webstats-*.whl|head -1)" webstats1:/home/ubuntu 7 | # login 8 | docker exec -it --user ubuntu webstats1 /bin/bash 9 | -------------------------------------------------------------------------------- /bioc_webstats/templates/401.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends "layout.html" %} 3 | 4 | {% block page_title %}Unauthorized{% endblock %} 5 | 6 | {% block content %} 7 |
<div>
8 |   <div>
9 |     <h1>401</h1>
10 |     <p>You are not authorized to see this page.
11 |     </p>
12 |     <p>Want to go home instead?</p>
13 |   </div>
14 | </div>
15 | {% endblock %} 16 | 17 | -------------------------------------------------------------------------------- /assets/js/main.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Main Javascript file for bioc_webstats. 3 | * 4 | * This file bundles all of your javascript together using webpack. 5 | */ 6 | 7 | // JavaScript modules 8 | require('@fortawesome/fontawesome-free'); 9 | require('jquery'); 10 | require('bootstrap'); 11 | 12 | require.context( 13 | '../img', // context folder 14 | true, // include subdirectories 15 | /.*/, // RegExp 16 | ); 17 | 18 | // Your own code 19 | require('./plugins'); 20 | require('./script'); -------------------------------------------------------------------------------- /bioc_webstats/templates/about.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends "layout.html" %} 3 | 4 | {% block content %} 5 |
<div>
6 |   <h1>About</h1>
7 |   <div>
8 |     <p>This template was created by Steven Loria for use with the cookiecutter package by Audrey Roy.</p>
9 |   </div>
10 | </div>
11 | {% endblock %}
12 |
13 |
--------------------------------------------------------------------------------
/etl/postgresql/t_webstats_info_psql.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.webstats_info
2 |
3 | -- DROP TABLE IF EXISTS public.webstats_info;
4 |
5 | CREATE TABLE IF NOT EXISTS public.webstats_info
6 | (
7 |     key character varying(23) COLLATE pg_catalog."default" NOT NULL,
8 |     value character varying(128) COLLATE pg_catalog."default",
9 |     CONSTRAINT webstats_info_pkey PRIMARY KEY (key)
10 | )
11 |
12 | TABLESPACE pg_default;
13 |
14 | ALTER TABLE IF EXISTS public.webstats_info
15 |     OWNER to postgres;
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Environment variable overrides for local development
2 | FLASK_APP="bioc_webstats.app:create_app('development')"
3 | FLASK_DEBUG=1
4 | FLASK_ENV=development
5 | #DATABASE_URL=sqlite:///dev.db
6 | DATABASE_URL=postgresql://postgres@localhost:5432/webstats
7 | LOG_LEVEL=DEBUG
8 | SECRET_KEY=not-so-secret
9 | # In production, set to a higher number, like 31556926
10 | SEND_FILE_MAX_AGE_DEFAULT=0
11 |
12 | # AWS Roles
13 | AWS_ROLE_WEBRUNNER=bioc-webstats-webrunner
14 | AWS_ROLE_ETL=bioc-webstats-etl
--------------------------------------------------------------------------------
/bioc_webstats/templates/main.css:
--------------------------------------------------------------------------------
1 | <style>
46 | </style>
47 |
48 |
49 |
50 | <table>
51 |   <tr>
52 |
53 |     <td>
54 |       See download stats for: &nbsp;&nbsp;
55 |       {% for ref, desc in category_links %}
56 |         Bioconductor {{ desc }} packages &nbsp;&nbsp;
57 |       {% endfor %}
58 |     </td>
59 |   </tr>
60 | </table>
61 | <h1>Download stats for Bioconductor {{ category_name }} packages</h1>
62 | <p>{{ url_list }}</p>
63 | <p>
64 | Data as of {{ generated_date.strftime("%a. %d %b %Y") }}.
65 | </p>
66 | <p>The number reported next to each package name is the download score, that is, the average number of
67 | distinct IPs that "hit" the package each month for the last 12 months (not counting the current month).</p>
68 | {% if top_count > 0 %}
69 |
70 | <h2>Top {{ top_count }}</h2>
71 | <table>
72 |   <tbody>
73 |
74 |
75 |
76 | {% set N = top|length // 3 %}
77 | {% for i in range(N) %}
78 |   <tr>
79 |   {% for j in range(3) %}
80 |     {% set (package_name, package_score, package_rank)=top[i + j * N] %}
81 |     <td>
82 |       {{ package_rank }}
83 |     </td>
84 |     <td>
85 |       {{ package_name}} ({{ package_score }})
86 |     </td>
87 |   {% endfor %}
88 |   </tr>
89 | {% endfor %}
90 |   </tbody>
91 | </table>
92 |
93 |
94 |
95 | <h2>All {{ category_name }} packages</h2>
96 | {% endif %}
97 | <p>
98 | All {{ category_name }} package stats in one file:&nbsp;
99 |
100 | {{ tab_page_prefix }}_pkg_stats.tab
101 |
102 | </p>
103 | <p>
104 | All {{ category_name }} download scores in one file:&nbsp;
105 |
106 | {{ tab_page_prefix }}_pkg_scores.tab
107 |
108 | </p>
109 | <p>
110 |
111 | See Download stats for Bioconductor {{ category_name }} repository (all packages combined)
112 |
113 | </p>
114 |
115 | {% for letterkey, package_list in scores.items() %}
116 |
117 | <h2>{{ letterkey }}</h2>
118 | {% set n = package_list|length %}
119 | {% set third = (n / 3)|round(method='ceil')|int %}
120 |
121 | {% for i in range(3) %}
122 |   <div>
123 |   {% for package_name, package_score, package_rank in package_list[i*third:(i+1)*third] %}
124 |
125 |     {{ package_name }} ({{ package_score }})
126 |
127 |   {% endfor %}
128 |   </div>
129 | {% endfor %}
130 |
131 | {% endfor %}
132 |
133 |
134 |
--------------------------------------------------------------------------------
/bioc_webstats/packages_table_update.py:
--------------------------------------------------------------------------------
1 | """get_bioc_package_history - Extract Bioconductor package history from source repo."""
2 | import logging
3 | import yaml
4 | import requests
5 | import bioc_webstats.models as db
6 |
7 |
8 | from flask import current_app
9 |
10 | # Local manifest constants
11 | BIOCONDUCTOR_HOME_URI = "https://www.bioconductor.org/"
12 | PACKAGE_CATEGORIES = ["bioc", "data-annotation", "data-experiment", "workflows"]
13 | PACKAGE_UPDATE_MAXIMUM_ALLOWED = 50
14 |
15 | def version_str_to_int(version_number: str):
16 |     parts = version_number.split('.')
17 |     if len(parts) == 2:
18 |         return int(parts[0]) * 100 + int(parts[1])
19 |     else:
20 |         raise ValueError(f"Invalid version number: {version_number}")
21 |
22 | def version_int_to_str(version_int: int):
23 |     return f"{version_int // 100}.{version_int % 100}"
24 |
25 | def web_download(stem: str, fqdn: str = BIOCONDUCTOR_HOME_URI):
26 |     uri = f"{fqdn}{stem}"
27 |     response = requests.get(uri)
28 |     if response.status_code != 200:
29 |         raise ValueError(f"Failed to download file from {uri}")
30 |     return response.text
31 |
32 |
33 | def packages_table_update(dry_run: bool, verbose: bool, force: bool):
34 |     """Update database table packages from manifests on www.bioconductor.org.
35 |
36 |     Keyword Arguments:
37 |     dry_run -- Calculate changes to packages but do not update the database (default: {False})
38 |     verbose -- Additional information to log file (default: {True})
39 |     force -- Proceed with update even if the number of changes exceeds PACKAGE_UPDATE_MAXIMUM_ALLOWED. (default: {False})
40 |
41 |     Returns:
42 |     None
43 |     """
44 |
45 |     log = current_app.logger
46 |     log.log(logging.INFO, "starting packages update")
47 |
48 |     bioconductor_config = yaml.safe_load(web_download("config.yaml"))
49 |
50 |     release_version = bioconductor_config["release_version"]
51 |     devel_version = bioconductor_config["devel_version"]
52 |
53 |     manifest_packages = {}
54 |     for category in PACKAGE_CATEGORIES:
55 |         package_text = web_download(f"packages/devel/{category.replace("-", "/")}/src/contrib/PACKAGES")
56 |         package_list = package_text.splitlines()
57 |         p = {line.split(": ")[1]: db.PackageType[category.removeprefix("data-").upper()] for line in package_list if line.startswith("Package:")}
58 |         manifest_packages.update(p)
59 |
60 |     # dev_packages represents the currently active packages in the "devel" version.
61 |     # all_packages is the complete package history.
62 |     # A package that is in dev but not all is new in the devel version,
63 |     # and so will be added with first_version set to the devel version and last_version set to NULL.
64 |     # A package that is in all but not dev has been removed from the last version,
65 |     # and so will have its last_version value set to the release_version.
66 |     # If it is in both dev and all, the last_version should be null. If it is not, then
67 |     # the package was reinstated in the devel release and the last_version will be reset to NULL.
68 |
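    # Worked illustration of the set algebra below (package names hypothetical):
    #   dev_packages          = {"pkgA", "pkgB"}   (in the devel manifest)
    #   all_active_packages   = {"pkgB", "pkgC"}   (history rows with last_version NULL)
    #   all_inactive_packages = {"pkgD"}           (history rows with a last_version)
    # gives removed = {"pkgC"}, reinstated = {} (it would be {"pkgD"} if pkgD
    # reappeared in devel), and new = {"pkgA"}.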
69 |     dev_packages = set(manifest_packages.keys())
70 |     # The Packages model returns 4-tuples. Turn this into a dictionary, indexed by package name.
71 |     all_package_details = db.Packages.all_package_details()
72 |     all_active_packages = {t[0] for t in all_package_details if t[3] == 'NULL'}
73 |     all_inactive_packages = {t[0] for t in all_package_details} - all_active_packages
74 |
75 |     removed_package_names = all_active_packages - dev_packages
76 |     reinstated_package_names = all_inactive_packages & dev_packages
77 |     new_package_names = dev_packages - all_active_packages - reinstated_package_names
78 |     if (verbose):
79 |         log.log(logging.INFO, f"Total packages before update: {len(all_package_details)}")
80 |         log.log(logging.INFO, f"Packages removed: {len(removed_package_names)}")
81 |         log.log(logging.INFO, f"Packages added: {len(new_package_names)}")
82 |         log.log(logging.INFO, f"Packages reinstated: {len(reinstated_package_names)}")
83 |
84 |     total_changes = len(removed_package_names) + len(reinstated_package_names) + len(new_package_names)
85 |     if total_changes > PACKAGE_UPDATE_MAXIMUM_ALLOWED:
86 |         log.log(logging.WARN, f"total number of changes ({total_changes}) exceeds maximum allowed ({PACKAGE_UPDATE_MAXIMUM_ALLOWED})")
87 |         if not force:
88 |             log.log(logging.ERROR, "No update made")
89 |             return
90 |         log.log(logging.WARN, "Force parameter is TRUE. Update will proceed")
91 |
92 |     # mark the inactive packages with the value of the last release
93 |     db.Packages.update_package_last_version(removed_package_names, release_version)
94 |     # mark any reinstated packages by setting the last_version to NULL
95 |     db.Packages.update_package_last_version(reinstated_package_names, None)
96 |     # insert any new packages
97 |     records = [{"package": package, "category": manifest_packages[package], "first_version": str(version_str_to_int(devel_version)), "last_version": None} for package in new_package_names]
98 |     if (len(records) > 0):
99 |         db.Packages.insert_records(records)
100 |         # TODO report on console if error
101 |     return
102 |
--------------------------------------------------------------------------------
/bioc_webstats/templates/stats-bioc.html:
--------------------------------------------------------------------------------
1 | {% macro title_text(package, category) %} Download stats for {% if package !=
2 | None %} {{ category_name }} package {{ package }} {% else %} Bioconductor {{
3 | category_name }} repository (all packages combined) {% endif %} {% endmacro %}
4 |
5 | <html>
6 | <head>
7 |
8 |
9 | <title>{{ title_text(package, category) }}</title>
10 | <style>
42 | </style>
43 | </head>
44 | <body>
45 |
46 | <table>
47 |   <tr><td>
48 |     Back to the "Download stats for Bioconductor {{ category_name }}
49 |     packages" page
50 |   </td></tr>
51 | </table>
52 |
53 |
54 | <h1>{{ title_text(package, category) }}</h1>
55 | <p>
56 | Data as of {{ generated_date.strftime("%a. %d %b %Y") }}
57 | </p>
58 | {% if package is not none %}
59 | <p>
60 | {% if deprecated_version is not none %}
61 | Package {{ package }} is not in the current release of Bioconductor. It was last seen in {{ deprecated_version }}.
62 | {% else %}
63 | {{ package }} home page:
64 | release version,
65 | devel version.
66 | {% endif %}
67 | </p>
68 | {% endif %}
69 | <p>
70 | Number of package downloads from the Bioconductor software package
71 | repository, year by year, from {{ last_year }} back to {{ first_year }} (years with no downloads are omitted):
72 | </p>
73 |
74 | {% for data_year in data_by_year.keys() %}
75 |
76 | <h2>{{ data_year }}</h2>
77 | <table>
78 |   <tr>
79 |     <td>
80 |
81 |       <canvas>
82 |         Your browser may be too old as it does not support the html canvas element.
83 |       </canvas>
84 |
85 |     </td>
86 |     <td>
87 |       <table>
88 |         <tr>
89 |           <th>Month</th>
90 |           <th>Nb of distinct IPs</th>
91 |           <th>Nb of downloads</th>
92 |         </tr>
93 |
94 |
95 |         {% for line in data_by_year[data_year] %}
96 |         <tr>
97 |           <td>{{line["month"]}}/{{ line["year"] }}</td>
98 |
99 |           <td>{{ line["unique_ips"] }}</td>
100 |
101 |
102 |           <td>{{ line["downloads"] }}</td>
103 |
104 |         </tr>
105 |         {% endfor %}
106 |
107 |       </table>
108 |
109 |       {{ package or category }}_{{ data_year }}_stats.tab
110 |
111 |
112 |     </td>
113 |   </tr>
114 | </table>
115 | {% endfor %}
116 |
117 | <p>
118 | All years in one file: {{ package or category
119 | }}_stats.tab
120 | </p>
121 | <script>
124 | </script>
125 |
126 | </body>
127 | </html>
--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
1 | """Model unit tests."""
2 | import datetime as dt
3 |
4 | import pytest
5 | from sqlalchemy import select
6 |
7 | from bioc_webstats.models import (
8 |     Packages,
9 |     PackageType,
10 |     Stats,
11 |     WebstatsInfo,
12 |     list_to_dict,
13 | )
14 |
15 | from .conftest import check_hashed_count_list
16 |
17 |
18 | @pytest.mark.usefixtures("db")
19 | class TestStats:
20 |     """Stats tests."""
21 |
22 |     def test_db_valid_thru_date(self, webstatsinfo):
23 |         """Verify the expected last database update date."""
24 |         # Arrange
25 |         expected = dt.date(2023, 10, 4)
26 |
27 |         # Act
28 |         result = WebstatsInfo.get_valid_thru_date()
29 |
30 |         # Assert
31 |         assert result == expected
32 |
33 |     def test_statsfactory_types(self, db):
34 |         """Test stats factory."""
35 |         # Arrange
36 |
37 |         # Act
38 |         results = db.session.execute(select(Stats))
39 |         result = next(results, None)[0]
40 |
41 |         # Assert
42 |         assert isinstance(result.category, PackageType)
43 |         assert str(result.package)
44 |         assert isinstance(result.date, dt.date)
45 |         assert bool(result.is_monthly)
46 |         assert int(result.ip_count)
47 |         assert int(result.download_count)
48 |
49 |     def test_stats_getall(self, db, stats):
50 |         """Compare contents of stats table with the list of dictionaries from which it was created."""
51 |         # Arrange
52 |
53 |         # Act
54 |         result = db.session.scalars(select(Stats))
55 |         result = list_to_dict(result)
56 |
57 |         # Assert
58 |         assert check_hashed_count_list(result)
59 |         assert stats == result
60 |
61 |     def test_get_package_names(self, packages):
62 |         """Get the complete list of package names in collation sequence."""
63 |         # Arrange
64 |         expected = sorted(packages)
65 |
66 |         # Act
67 |         result = Packages.get_package_names()
68 |
69 |         # Assert
70 |         assert expected == result
71 |
72 |     # TODO Review database return values for consistency
73 |     # TODO Verify that we only want dates and counts for this function
74 |     def test_get_download_counts_year(self, stats):
75 |         """Select category, package and year."""
76 |         category = PackageType.BIOC
77 |         package = "affy"
78 |         year = 2023
79 |         expected = [(x["date"], x["ip_count"], x["download_count"]) for x in stats
80 |                     if x["category"] == category and x["package"] == package and x["date"].year == year]
81 |
82 |         result = Stats.get_download_counts(category=category, package=package, year=year)
83 |
84 |         # Assert
85 |         assert expected == result
86 |
87 |     def test_get_download_counts_full_year(self, stats):
88 |         """Select one full year of download counts."""
89 |         # Arrange
90 |         category = PackageType.ANNOTATION
91 |         package = "BSgenome.Hsapiens.UCSC.hg38"
92 |         year = 2022
93 |         expected = [(d["date"], d["ip_count"], d["download_count"]) for d in stats
94 |                     if d["category"] == category and d["package"] == package and d["date"].year == year]
95 |
96 |         result = Stats.get_download_counts(
97 |             category=category, package=package, year=year
98 |         )
99 |
100 |         # Assert
101 |         assert result == expected
102 |
103 |     def test_get_download_counts_package(self, stats):
104 |         """Select all the download counts for a given package."""
105 |         # Arrange
106 |         category = PackageType.ANNOTATION
107 |         package = "BSgenome.Hsapiens.UCSC.hg38"
108 |         expected = [(d["date"], d["ip_count"], d["download_count"]) for d in stats
109 |                     if d["category"] == category and d["package"] == package]
110 |
111 |         # Check the legacy sort order (increasing or decreasing year, but always increasing month and day)
112 |         result_hi_first = Stats.get_download_counts(category=category, package=package, newest_year_first=True)
113 |         result_lo_first = Stats.get_download_counts(category=category, package=package, newest_year_first=False)
114 |
115 |         # Assert
116 |         expected_hi = sorted(expected, key=lambda x: (-x[0].year, x[0].month, x[0].day))
117 |         assert result_hi_first == expected_hi
118 |
119 |         expected_lo = sorted(expected, key=lambda x: (x[0].year, x[0].month, x[0].day))
120 |         assert result_lo_first == expected_lo
121 |
122 |     def test_get_download_counts_category(self, stats):
123 |         """Select all the download counts for a given category."""
124 |         # Arrange
125 |         category = PackageType.BIOC
126 |         expected = [(d["package"], d["date"], d["ip_count"], d["download_count"])
127 |                     for d in stats if d["category"] == category]
128 |
129 |         result = Stats.get_download_counts(category=category)
130 |
131 |         # Assert
132 |         assert result == expected
133 |
134 |     def test_get_download_scores(self, stats):
135 |         """Select all the scores for a given category."""
136 |         # Arrange
137 |         category = PackageType.BIOC
138 |         expected = [('affy', 2, 2), ('affydata', 7, 1)]
139 |
140 |         result = Stats.get_download_scores(category=category)
141 |
142 |         # Assert
143 |         assert result == expected
--------------------------------------------------------------------------------
/bioc_webstats/aws_functions.py:
--------------------------------------------------------------------------------
1 | """ aws_functions TODO rename this. """
2 | import json
3 | import logging
4 | import boto3
5 | import psycopg2
6 | from botocore.exceptions import ClientError
7 |
8 |
9 | def aws_assume_sts_role(role_arn, role_session_name):
10 |     """Assume the given STS role and return its temporary credentials."""
11 |     try:
12 |         # Create an STS client
13 |         sts_client = boto3.client('sts')
14 |
15 |         # Assume the specified role
16 |         assumed_role_object = sts_client.assume_role(
17 |             RoleArn=role_arn,
18 |             RoleSessionName=role_session_name
19 |         )
20 |
21 |         # Credentials to be used for the session with the assumed role
22 |         credentials = assumed_role_object['Credentials']
23 |         return credentials
24 |
25 |     except Exception as e:
26 |         logging.critical(f"Could not assume AWS role {role_arn}. {e}")
27 |         raise SystemExit(1)
28 |
29 |
30 | def get_parameter_store_values(parameter_path: str, region_name='us-east-1') -> dict:
31 |     """Get all SSM parameter store values for a specific configuration.
32 |
33 |     Arguments:
34 |     parameter_path -- The prefix for the configuration. Example: "/bioc/webstats/dev"
35 |
36 |     Returns:
37 |     A dictionary of parameter names (excluding the prefix) and their values.
38 |     """
39 |
40 |     try:
41 |         ssm_client = boto3.client('ssm', region_name=region_name)
42 |         # get_parameters_by_path returns results a page at a time; iterate with
43 |         # the paginator so configurations with more than one page of parameters
44 |         # are read completely.
45 |         paginator = ssm_client.get_paginator('get_parameters_by_path')
46 |         result = {}
47 |         for page in paginator.paginate(Path=parameter_path, Recursive=True):
48 |             for item in page["Parameters"]:
49 |                 result[item["Name"][len(parameter_path) + 1:]] = item["Value"]
50 |         return result
51 |
52 |     except Exception as e:
53 |         logging.critical(f"Failed to read AWS parameter store {parameter_path}. {e}")
54 |         raise SystemExit(1)
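# Hedged example (parameter names and values are illustrative): if SSM holds
#   /bioc/webstats/dev/db/dbname  = "webstats"
#   /bioc/webstats/dev/log_level  = "DEBUG"
# then get_parameter_store_values("/bioc/webstats/dev") returns
#   {"db/dbname": "webstats", "log_level": "DEBUG"}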
55 |
56 |
57 | def get_secret(secret_name, region_name):
58 |     """Return the SecretString for a secret held in AWS Secrets Manager."""
59 |     # Create a Secrets Manager client
60 |     session = boto3.session.Session()
61 |     client = session.client(
62 |         service_name='secretsmanager',
63 |         region_name=region_name
64 |     )
65 |
66 |     try:
67 |         get_secret_value_response = client.get_secret_value(
68 |             SecretId=secret_name
69 |         )
70 |
71 |         # Decrypts secret using the associated KMS key.
72 |         secret = get_secret_value_response['SecretString']
73 |         return secret
74 |
75 |     except Exception as e:
76 |         logging.critical(f"Could not read AWS secret {secret_name}. {e}")
77 |         raise SystemExit(1)
78 |
79 | def psql_get_connection(secret_name, region_name, database_name):
80 |     """Open a psycopg2 connection using credentials held in AWS Secrets Manager."""
81 |
82 |     try:
83 |         connection_string = aws_secret_to_psql_url(secret_name, region_name, database_name)
84 |         conn = psycopg2.connect(connection_string)
85 |         return conn
86 |
87 |     except Exception as e:
88 |         logging.critical(f"Could not open psql {database_name} with {secret_name}. {e}")
89 |         raise SystemExit(1)
90 |
91 |
92 | def aws_secret_to_psql_url(secret_name, region_name, database_name):
93 |     """Build a PostgreSQL connection URL from a Secrets Manager credentials secret."""
94 |     secret = get_secret(secret_name, region_name)
95 |     db_credentials = json.loads(secret)
96 |     # TODO: add database name to secret in Secrets Manager
97 |     db_credentials['dbname'] = database_name
98 |
99 |     # Create the PostgreSQL connection string
100 |     connection_string = f"postgresql://{db_credentials['username']}:{db_credentials['password']}@{db_credentials['host']}:{db_credentials['port']}/{db_credentials['dbname']}"
101 |     return connection_string
102 |
103 |
104 | def uri_to_arn(uri):
105 |     # Parse the URI to extract the components
106 |     scheme, path = uri.split("://", 1)
107 |     if scheme != "awsarn":
108 |         raise ValueError("Invalid scheme in URI: expected 'awsarn'")
109 |
110 |     # Split the path into its components
111 |     service_region, account_id, resource_type_path = path.split("/", 2)
112 |     service, region = service_region.split(".", 1)
113 |
114 |     # Construct the ARN
115 |     arn = f"arn:aws:{service}:{region}:{account_id}:{resource_type_path}"
116 |
117 |     return arn
118 |
119 | # Example usage
120 | # uri = "awsarn://secretsmanager.us-east-1.amazonaws.com/931729544676/secret/bioc/rdb/login/webstats_runner-fQFuUn"
121 | # arn = uri_to_arn(uri)
122 | # print(arn)
123 |
124 |
125 | def arn_to_uri(arn):
126 |     # Validate and parse the ARN
127 |     parts = arn.split(':')
128 |     if len(parts) != 6 or parts[0] != 'arn' or parts[1] != 'aws':
129 |         raise ValueError("Invalid ARN format")
130 |
131 |     # Extract the ARN components
132 |     _, _, service, region, account_id, resource_path = parts
133 |
134 |     # Construct the URI
135 |     uri = f"awsarn://{service}.{region}.amazonaws.com/{account_id}/{resource_path}"
136 |
137 |     return uri
138 |
139 | # Example usage
140 | # arn = "arn:aws:secretsmanager:us-east-1:931729544676:secret:/bioc/rdb/login/webstats_runner-fQFuUn"
141 | # uri = arn_to_uri(arn)
142 | # print(uri)
143 |
144 | def cloudfront_invalidation(distribution_id, paths):
145 |     """Invalidate the CloudFront cache.
146 |
147 |     Keyword Arguments:
148 |     distribution_id -- The CloudFront distribution whose cache is invalidated (default: {'E1TVLJONPTUXV3'})
149 |     paths -- The URL paths to invalidate (default: {['/packages/stats/*']})
150 |     """
151 |
152 |
153 |     # TODO move distribution_id and paths defaults to flask dispatcher
154 |     client = boto3.client('cloudfront')
155 |
156 |     # Distribution ID and the paths you want to invalidate
157 |
158 |     # Create the invalidation
159 |     response = client.create_invalidation(
160 |         DistributionId=distribution_id,
161 |         InvalidationBatch={
162 |             'Paths': {
163 |                 'Quantity': len(paths),
164 |                 'Items': paths
165 |             },
166 |             'CallerReference': str(hash(frozenset(paths)))  # Unique reference
167 |         }
168 |     )
169 |
170 |     # TODO from response, log error if needed, otherwise report timestamp for invalidation
--------------------------------------------------------------------------------
/docs/webstats-system-overview.md:
--------------------------------------------------------------------------------
1 | # `webstats` System Overview
2 |
3 | Author: Robert Shear rshear@ds.dfci.harvard.edu
4 | Date: 2024-09-25
5 |
6 | This document provides a high level overview of the operation of the `bioc-webstats` system. It is current as of version `0.1.9`.
7 |
8 | The purpose of bioc-webstats is to maintain a permanent record of the download counts for each Bioconductor package in an SQL database and to report this information on www.bioconductor.org. It includes records from January 1, 2009, to the present.
9 |
10 | This application replaces the "stats server" application. That server produced static pages for all the content under `www.bioconductor.org/packages/stats/`. The `bioc-webstats` application, in its initial implementation, is designed to match the exact form of the application that it replaces.
11 |
12 | The `bioc-webstats` application is implemented as a Python application supported by an SQL database. It has two major functions.
13 |
14 | 1. Data ingestion. Consume web traffic logs in Common Log Format (CLF), select those records which are package downloads, store them in an SQL table, and maintain summary statistics for each package.
15 | 2. Web reporting. The application can serve as a backend to any web server that supports the `WSGI` standard. It consumes `http get` requests, infers their semantics from the `URI` stem, and returns content that is functionally the same as the system it replaces. In the case of `html` responses, this means that both the content and the look and feel are the same. Other responses are unformatted text downloads (`.tab` and `.txt`), which are byte-for-byte identical to the prior system.
16 |
17 | ## Technical Stack
18 |
19 | The application implementation is based on several frameworks and libraries:
20 |
21 | - Python 3.12
22 | - [Poetry](https://python-poetry.org) - Dependency management and packaging.
23 | - [Flask](https://flask.palletsprojects.com) - Web application framework.
24 | - [SQLAlchemy](https://www.sqlalchemy.org) - Python SQL toolkit and object-relational mapper.
25 | - [Chart.js](https://www.chartjs.org/) - JavaScript charting library.
26 | - [Bootstrap 5](https://getbootstrap.com) - Responsive JavaScript frontend toolkit.
27 |
28 |
29 | There are various additional Python and JavaScript dependencies that support the application. See `pyproject.toml` in the project root directory for details.
30 |
31 | The application is distributed as a `whl` file and can be installed by any installation and package manager, including `poetry`, `pipenv`, `pipx`, `virtualenv`, or `conda`.
32 |
33 | ## General System Flow
34 |
35 | The general system flow as deployed as of this writing is depicted in Figure 1 below. The Python application can be run on any server that has secure access to `master.bioconductor.org` and the AWS `Athena` service that can read the CloudFront CLF logs. This includes `master.bioconductor.org` itself. Other components, including the SQL server and the internal web server, are easily replaced.
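For example, the production entry point in this repository serves the Flask app through Waitress (see `bioc_webstats/app_waitress.py`); a minimal sketch of that arrangement:

```python
from waitress import serve
from bioc_webstats import app

# create_app('production', '/bioc/webstats/prod') builds the Flask app and
# pulls its runtime parameters from that AWS SSM Parameter Store path.
serve(app.create_app('production', '/bioc/webstats/prod'),
      host='0.0.0.0', port=8000)
```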
36 |
37 | Not depicted in the stack are AWS-specific features for configuration (the AWS Systems Manager Parameter Store) and security (the AWS Secrets Manager).
38 |
39 | Note: The direction of each line indicates the functional flow of information. That is, for every request-response pattern, the arrow points to the consumer of the response.
40 |
41 | ![General System Flow](bioc-webstats-architecture-v2.excalidraw.png)
42 | <p align="center">
43 | Figure 1: General System Flow.
44 | </p>
45 |
46 | A. A [Waitress](https://pypi.org/project/waitress/) lightweight webserver that consumes incoming `http get` requests from, and returns results to, an upstream webserver (arrow 3).
47 |
48 | B. A cron job that runs daily at 01:00 UTC that:
49 | - Detects changes in the development version of the manifest (arrow 1).
50 | - Invokes an AWS Athena view to return all CloudFront log entries for dates newer than those previously uploaded (arrow 2), but only if they have a URI stem that implies a download, and only if the package name is valid.
51 | - Updates the summary tables, `stats` and `categorystats` (see Database Structure below).
52 |
53 | C. An SQL database server. Currently implemented as a serverless AWS RDS instance, Postgres 15.
54 |
55 |
56 | # Database Structure
57 |
58 | ![Database Model](webstats-erd-0_1_9.png)
59 |
60 |
61 | # Configuration
62 |
63 | ## Development Configuration
64 |
65 | ## Production Configuration
66 |
67 | ## Parameter Names and Default Values
68 |
69 |
70 | | Name | FlaskName | Default Value | Description |
71 | | ----------------- | ------------- | ------------------------------ | ------------------------ |
72 | | db/dbname | DBNAME | webstats | Postgres database name, default 'webstats' |
73 | | db/credentials | DBCREDENTIALS | arn:aws:secretsmanager:reference-to-database-credentials-secret | arn of secrets manager secret |
74 | | db/dbuser | DBUSER | webstats_runner | PostgreSQL user name, default 'webstats_runner' |
75 | | db/port | DBPORT | 5432 | Server endpoint port number |
76 | | db/server | DBSERVER | None | The symbolic address of the endpoint for the Postgres server |
77 | | flask/flask_app | APP | bioc_webstats.app:create_app('development') | Default initiation call for Flask |
78 | | flask/approot | APPROOT | /var/www/webstats | Working directory for app |
79 | | flask/logroot | LOGROOT | /var/log/bioc-webstats | Location of log files for the app |
80 | | flask/osgroup | OSGROUP | webstats | Execution group name |
81 | | flask/osuser | OSUSER | webstats | Execution user name |
82 | | flask/flask_debug | DEBUG | FALSE | Caution: Do not enable in production |
83 | | flask/log_level | LOG_LEVEL | INFO | Standard log levels, default 'INFO' |
84 | | flask/secret_key | SECRET_KEY | None | Secret key for activating web client flask debugging tools |
85 |
86 | # Deployment
87 |
88 | TODO
89 |
90 | # Web Application Internals Overview
91 |
92 | The platform-independent logic for the system is in the directory `./bioc_webstats`.
93 |
94 | ## Initialization
95 |
96 | The application is always initialized by invoking `bioc_webstats.app`.
97 |
98 | TODO
99 |
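A minimal sketch of that invocation, mirroring `autoapp.py` and the production `flask_environment` file in this repository:

```python
from bioc_webstats.app import create_app

# Development: defaults from configmodule plus any .env overrides.
app = create_app("development")

# Production: the second argument names the AWS SSM Parameter Store path
# from which runtime parameters are read.
# app = create_app('production', '/bioc/webstats/prod')
```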
100 | ## Processing a `http get`
101 |
102 | 1. `stats.py`
103 |
104 | 2. `models.py`
105 |
106 | 3. `templates/stats-bioc.html`
107 |
108 |
--------------------------------------------------------------------------------
/bioc_webstats/commands.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Click commands."""
3 | import os
4 | from datetime import date, datetime
5 | from glob import glob
6 | import logging
7 | from subprocess import call
8 |
9 | import boto3
10 | import click
11 |
12 | from flask import current_app
13 | from bioc_webstats.ingest_logs import ingest_logs
14 | from bioc_webstats.packages_table_update import packages_table_update
15 | from bioc_webstats.configmodule import configuration_dictionary
16 |
17 | HERE = os.path.abspath(os.path.dirname(__file__))
18 | PROJECT_ROOT = os.path.join(HERE, os.pardir)
19 | TEST_PATH = os.path.join(PROJECT_ROOT, "tests")
20 |
21 |
22 | def parse_date(ctx, param, value):
23 |     """Helper for parsing click.option dates."""
24 |     if value is None:
25 |         return value
26 |     try:
27 |         return datetime.strptime(value, "%Y-%m-%d").date()
28 |     except ValueError:
29 |         raise click.BadParameter("Date should be in YYYY-MM-DD format.")
30 |
31 |
32 | @click.command()
33 | @click.option(
34 |     "-c/-C",
35 |     "--coverage/--no-coverage",
36 |     default=True,
37 |     is_flag=True,
38 |     help="Show coverage report",
39 | )
40 | def test(coverage):
41 |     """Run the tests."""
42 |     import pytest
43 |
44 |     args = [TEST_PATH, "--verbose"]
45 |     if coverage:
46 |         args.append("--cov=bioc_webstats")
47 |     rv = pytest.main(args)
48 |     exit(rv)
49 |
50 |
51 | @click.command()
52 | @click.option(
53 |     "-f",
54 |     "--fix-imports",
55 |     default=True,
56 |     is_flag=True,
57 |     help="Fix imports using isort, before linting",
58 | )
59 | @click.option(
60 |     "-c",
61 |     "--check",
62 |     default=False,
63 |     is_flag=True,
64 |     help="Don't make any changes to files, just confirm they are formatted correctly",
65 | )
66 | def lint(fix_imports, check):
67 |     """Lint and check code style with black, flake8 and isort."""
68 |     skip = ["node_modules", "requirements", "migrations"]
69 |     root_files = glob("*.py")
70 |     root_directories = [
71 |         name for name in next(os.walk("."))[1] if not name.startswith(".")
72 |     ]
73 |     files_and_directories = [
74 |         arg for arg in root_files + root_directories if arg not in skip
75 |     ]
76 |
77 |     def execute_tool(description, *args):
78 |         """Execute a checking tool with its arguments."""
79 |         command_line = list(args) + files_and_directories
80 |         click.echo(f"{description}: {' '.join(command_line)}")
81 |         rv = call(command_line)
82 |         if rv != 0:
83 |             exit(rv)
84 |
85 |     isort_args = []
86 |     black_args = []
87 |     if check:
88 |         isort_args.append("--check")
89 |         black_args.append("--check")
90 |     if fix_imports:
91 |         execute_tool("Fixing import order", "isort", *isort_args)
92 |     execute_tool("Formatting style", "black", *black_args)
93 |     execute_tool("Checking code style", "flake8")
94 |
95 |
96 | @click.command()
97 | def gendb():
98 |     """Generate small test database."""
99 |     from bioc_webstats.database import db
100 |     from tests.conftest import generate_small_test_db
101 |
102 |     click.echo("Creating small test database")
103 |     app = current_app._get_current_object()
104 |
105 |     test_db_contents = generate_small_test_db(app)
106 |
107 |
108 | @click.command()
109 | @click.option(
110 |     "-s",
111 |     "--start",
112 |     required=False,
113 |     callback=parse_date,
114 |     help="Beginning date for upload. Default: first date not already processed.",
115 | )
116 | @click.option(
117 |     "-e",
118 |     "--end",
119 |     required=False,
120 |     callback=parse_date,
121 |     help="Ending date for upload. Default: yesterday (UTC)",
122 | )
123 | @click.option(
124 |     "-d",
125 |     "--database",
126 |     required=False,
127 |     help="Name of the source database. Default: default",
128 | )
129 | @click.option(
130 |     "-f",
131 |     "--filename",
132 |     required=False,
133 |     help="Specifies the name of a local file to receive the csv results instead of sending them to the database",
134 | )
135 | @click.option(
136 |     "-c",
137 |     "--cloudfront",
138 |     required=False,
139 |     help="If present, the distribution ID of the CloudFront cache to refresh. If absent, no refresh",
140 | )
141 | @click.option(
142 |     "--path",
143 |     required=False,
144 |     help="The CloudFront path to refresh. Default: '/packages/stats/*'",
145 | )
146 | def ingest(start, end, database, filename, cloudfront, path):
147 |     """Read raw weblogs, select valid package downloads, update webstats database."""
148 |
149 |     if path is None:
150 |         path = "/packages/stats/*"
151 |
152 |     ingest_logs(
153 |         start_date=start,
154 |         end_date=end,
155 |         source_database=database,
156 |         result_filename=filename,
157 |         cloudfront_id=cloudfront,
158 |         cloudfront_path=path,
159 |     )
160 |
161 |
162 | @click.command()
163 | @click.option(
164 |     "-n", "--namespace", required=False, help="Namespace (parameter path prefix)"
165 | )
166 | @click.option("-p", "--profile", required=False, help="AWS SSO profile for target")
167 | @click.option("-r", "--region", required=False, help="AWS target region")
168 | def configp(namespace, profile, region):
169 |     """Initialize AWS parameter set."""
170 |
171 |     if namespace is None:
172 |         namespace = '/bioc/webstats/prod/'
173 |     if region is None:
174 |         region = 'us-east-1'
175 |     if profile is None:
176 |         profile = 'bioc'
177 |
178 |     session = boto3.Session(
179 |         profile_name=profile,
180 |         region_name=region
181 |     )
182 |
183 |     ssm_client = session.client('ssm')
184 |
185 |     # TODO Test for previously existing. Create --force parameter
186 |     # TODO Add tags
187 |
188 |     try:
189 |         for p in configuration_dictionary:
190 |             q = dict(p)  # copy, so the shared configuration_dictionary entry is not mutated
191 |             q["Name"] = namespace + p["Name"]
192 |             response = ssm_client.put_parameter(**q)
193 |             # TODO check response for errors
194 |     except Exception as e:
195 |         logging.error(f"Failed to store parameters. {e}")
196 |         raise e
197 |     logging.info(f"AWS parameter set configured namespace:{namespace} profile:{profile} region:{region}.")
198 |
199 | @click.command()
200 | @click.option("-d", "--dry_run", is_flag=True, required=False, help="Report packages changes but do not update database")
201 | @click.option("-v", "--verbose", is_flag=True, required=False, help="More reporting")
202 | @click.option("-f", "--force", is_flag=True, required=False, help="Update table even though the update is suspiciously large")
203 | def packages(dry_run: bool = False, verbose: bool = True, force: bool = False):
204 |     """Read package information from the Bioconductor infrastructure and update the packages table to reflect current status."""
205 |
206 |     packages_table_update(dry_run, verbose, force)
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Initialization for pytests."""
2 | import datetime as dt
3 | import logging
4 | import math
5 | from zlib import crc32
6 |
7 | import pytest
8 | from dateutil.relativedelta import relativedelta
9 | from flask import Flask
10 | from flask_sqlalchemy import SQLAlchemy
11 | from sqlalchemy import create_engine
12 | from sqlalchemy.pool import StaticPool
13 | from webtest import TestApp
14 |
15 | from bioc_webstats.app import create_app
16 | from bioc_webstats.extensions import db as _db
17 | from bioc_webstats.models import PackageType
18 |
19 | from .factories import PackagesFactory, StatsFactory, WebstatsInfoFactory
20 |
21 |
22 | @pytest.fixture(scope="session")
23 | def app():
24 |     """Create application for the tests."""
25 |     _app = create_app("debug")
26 |     create_engine(
27 |         "sqlite:///:memory:",
28 |         connect_args={"check_same_thread": False},
29 |         poolclass=StaticPool,
30 |     )
31 |     _app.logger.setLevel(logging.DEBUG)
32 |     ctx = _app.test_request_context()
33 |     ctx.push()
34 |
35 |     yield _app
36 |
37 |     ctx.pop()
38 |
39 |
40 | @pytest.fixture(scope="session")
41 | def db(app: Flask):
42 |     """Session-wide test database."""
43 |     _db = generate_small_test_db(app)
44 |     yield _db
45 |     _db.session.close()
46 |     _db.drop_all()
47 |
48 |
49 | @pytest.fixture(scope="session")
50 | def webapp(app: Flask, db: SQLAlchemy):
51 |     """Fixture for app test."""
52 |     return TestApp(app)
53 |
54 |
55 | @pytest.fixture(scope="function")
56 | def session(db: SQLAlchemy, request: pytest.FixtureRequest):
57 |     """Create isolated transaction."""
58 |     db.session.begin_nested()
59 |
60 |     def commit():
61 |         db.session.flush()
62 |
63 |     # patch commit method
64 |     old_commit = db.session.commit
65 |     db.session.commit = commit
66 |
67 |     def teardown():
68 |         db.session.rollback()
69 |         db.session.close()
70 |         db.session.commit = old_commit
71 |
72 |     request.addfinalizer(teardown)
73 |     return db.session
74 |
75 |
76 | database_test_cases = [
77 |     (PackageType.BIOC, "affy", "2023-09-01"),
78 |     (PackageType.BIOC, "affydata", "2023-08-01"),
79 |     (PackageType.ANNOTATION, "BSgenome.Hsapiens.UCSC.hg38", "2019-01-01"),
80 |     (PackageType.ANNOTATION, "BSgenome.Scerevisiae.UCSC.sacCer3", "2021-01-01"),
81 | ]
82 |
83 | database_test_valid_date = dt.date(2023, 10, 4)
84 |
85 |
86 | def create_hashed_counts(d: dict) -> tuple[int, int]:
87 |     """Calculate reproducible hashed ip_count and download_count values for test stats rows.
88 |
89 |     For small database tests, create ip_count and download_count values that are a function of the
90 |     other columns of the stats table. This function is used to both generate the test rows and to check
91 |     that the test rows return the correct values.
92 |
93 |     Arguments:
94 |     d -- A dictionary containing the values of a stats record
95 |
96 |     Returns:
97 |     An ordered pair, the hashed ip_count and the hashed download_count
98 |     """
99 |     s = "|".join(
100 |         [str(d.get(tag, "")) for tag in ["category", "package", "date", "is_monthly"]]
101 |     )
102 |     # 9007 is a prime number of a size to give a reasonable hash for test purposes
103 |     download_count = crc32(s.encode("utf-8")) % 9007
104 |     ip_count = int(math.ceil(math.sqrt(download_count)))
105 |     return (ip_count, download_count)
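# Worked illustration (shape only; the crc32 value is not recomputed here):
# for a row {"category": PackageType.BIOC, "package": "affy",
# "date": dt.date(2023, 9, 1), "is_monthly": True} the joined key is
# "PackageType.BIOC|affy|2023-09-01|True"; download_count = crc32(key) % 9007
# and ip_count = ceil(sqrt(download_count)), so reruns regenerate identical
# counts without storing fixture data.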
This function is used to both generate the test rows and to check 91 | that the test rows return the correct values. 92 | 93 | Arguments: 94 | d -- A dictionary containing the values of a stats record 95 | 96 | Returns: 97 | An ordered pair: the hashed ip_count and the hashed download_count 98 | """ 99 | s = "|".join( 100 | [str(d.get(tag, "")) for tag in ["category", "package", "date", "is_monthly"]] 101 | ) 102 | # 9007 is a prime number of a size to give a reasonable hash for test purposes 103 | download_count = crc32(s.encode("utf-8")) % 9007 104 | ip_count = int(math.ceil(math.sqrt(download_count))) 105 | return (ip_count, download_count) 106 | 107 | 108 | def generate_small_test_db_packages(): 109 | """Create list of package names in the small_test database.""" 110 | packages_dict = [] 111 | for category, package, _ in database_test_cases: 112 | u = { 113 | "category": category, 114 | "package": package, 115 | "first_version": 201, 116 | "last_version": None, 117 | } 118 | packages_dict.append(u) 119 | return packages_dict 120 | 121 | 122 | def generate_small_test_db_stats(): 123 | """Create list of dictionary objects corresponding to Stats columns for the small test database.""" 124 | end_date = database_test_valid_date 125 | 126 | def months_sequence(start_date: dt.date, end_date: dt.date): 127 | """Yield the first day of each month from start_date to end_date inclusive.""" 128 | current_date = start_date 129 | 130 | while current_date <= end_date: 131 | yield current_date 132 | current_date += relativedelta(months=1) 133 | 134 | stats_dict = [] 135 | for category, package, start_date in database_test_cases: 136 | for d in months_sequence( 137 | dt.datetime.strptime(start_date, "%Y-%m-%d").date(), end_date 138 | ): 139 | u = { 140 | "category": category, 141 | "package": package, 142 | "date": d, 143 | "is_monthly": True, 144 | } 145 | u["ip_count"], u["download_count"] = create_hashed_counts(u) 146 | stats_dict.append(u) 147 | 148 | return stats_dict 149 | 150 | def generate_small_test_db(app: Flask): 151 | """Session-wide test database.""" 152 | _db.app = app 153 | with app.app_context(): 154 | _db.create_all() 155 | u = generate_small_test_db_stats() 156 | [StatsFactory(**v) for v in u] 157 | u = [{"key": "ValidThru", "value": "2023-10-04"}] 158 | [WebstatsInfoFactory(**v) for v in u] 159 | u = generate_small_test_db_packages() 160 | [PackagesFactory(**v) for v in u] 161 | _db.session.commit() 162 | return _db 163 | 164 | 165 | def check_hashed_counts(d: dict) -> bool: 166 | """Check that a generated test data row with hashed counts is correct. 167 | 168 | Arguments: 169 | d -- Dictionary form of stats table row 170 | 171 | Returns: 172 | True ==> The ip_count and download_count match the calculated hash 173 | """ 174 | ip_count, download_count = create_hashed_counts(d) 175 | return ( 176 | d.get("ip_count", -1) == ip_count 177 | and d.get("download_count", -1) == download_count 178 | ) 179 | 180 | 181 | def check_hashed_count_list(d_list: list[dict]) -> bool: 182 | """Check that all stats rows in a list have the expected hash counts. 183 | 184 | Arguments: 185 | d_list -- A list of dictionaries derived from Stats rows. 186 | 187 | Returns: 188 | True ==> All the rows have the expected count values. 189 | False ==> At least one row has an incorrect count value. 
190 | """ 191 | for r in d_list: 192 | if not check_hashed_counts(r): 193 | return False 194 | return True 195 | 196 | 197 | @pytest.fixture(scope="session") 198 | def webstatsinfo(db: SQLAlchemy): 199 | """Create WebstatsInfo for the tests.""" 200 | return 201 | 202 | 203 | @pytest.fixture(scope="session") 204 | def stats(db: SQLAlchemy): 205 | """Create stats for the tests.""" 206 | return generate_small_test_db_stats() 207 | 208 | 209 | @pytest.fixture(scope="session") 210 | def packages(db: SQLAlchemy): 211 | """Create packages for the tests.""" 212 | return [u['package'] for u in generate_small_test_db_packages()] 213 | -------------------------------------------------------------------------------- /bioc_webstats/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | app.py 3 | 4 | Create the Flask application and initialize the environment. 5 | 6 | Summary: 7 | This module implements the application factory, 8 | as explained here: http://flask.pocoo.org/docs/patterns/appfactories/. 9 | 10 | Description: 11 | All the run time parameters are read and various services are registered with the Flask infrastructure. 12 | 13 | Notes: 14 | The run time parameters can come from the following sources. They are processed in the order 15 | shown here. A later source will overwrite an earlier source. 16 | 1. Bootstrap parameters. Environment variables necessary to get started. 17 | 2. configmodule.py. This defines the invariant default values for each parameter. Also defines manifest constants. 18 | 3. An environmental parameter store. Specifically, the AWS Systems Manager (SSM) Parameter Store. 19 | 4. FLASK_* environment variables. For temporary overrides in production. 20 | 5. ".env" files. Provides for parameters to be set based on their presence in this file. 21 | Useful for setting up test environments. Should never be used in production. 22 | 23 | """ 24 | import logging 25 | import logging.handlers 26 | import os 27 | import sys 28 | 29 | from flask import Flask, render_template 30 | from werkzeug.utils import import_string 31 | 32 | import bioc_webstats.aws_functions as aws 33 | from bioc_webstats import commands, splash, stats 34 | from bioc_webstats.configmodule import configuration_dictionary 35 | from bioc_webstats.extensions import ( 36 | cache, 37 | csrf_protect, 38 | db, 39 | debug_toolbar, 40 | flask_static_digest, 41 | migrate, 42 | ) 43 | 44 | 45 | def create_app( 46 | config_type=None, 47 | aws_parameter_path=None, 48 | enable_remote_debugging=False 49 | ): 50 | """The Application Factory. Set up the particular instance of the Flask class. 51 | 52 | Keyword Arguments: 53 | config_type -- The configmodule subclass object to use. Allowed values "production" and "development" (default: {"production"}) 54 | aws_parameter_path -- If present, the AWS Systems Manager Parameter Store will be searched for runtime parameters. See function aws.get_parameter_store_values for more information (default: {None}) 55 | enable_remote_debugging -- If True, the system will attempt to enable the Visual Studio Code remote debugging protocol. See package ptvsd for details. (default: {False}) 56 | 57 | Returns: 58 | A fully configured Flask App object. 
59 | """ 60 | 61 | 62 | if enable_remote_debugging: 63 | # This will allow the use of the VS Code remote debugger. 64 | import ptvsd 65 | ptvsd.enable_attach(address=('0.0.0.0', 5678), redirect_output=True) 66 | print("Waiting for debugger to attach...") 67 | ptvsd.wait_for_attach() 68 | 69 | app = Flask(__name__.split(".")[0]) 70 | 71 | # Bootstrap variables 72 | if config_type is None: 73 | config_type = os.getenv('FLASK_ENV', config_type) 74 | app.config["ENV"] = config_type 75 | 76 | config_object_name = f"bioc_webstats.configmodule.{app.config['ENV']}Config" 77 | 78 | # Populate the configuration from config and its subclasses 79 | cfg = import_string(config_object_name)() 80 | app.config.from_object(cfg) 81 | 82 | # Next, load parameters from the SSM Parameter store. 83 | if aws_parameter_path is not None: 84 | param_dict = aws.get_parameter_store_values(app.config["AWS_PATH_PARAMETER"]) 85 | xref = {} 86 | for u in configuration_dictionary: 87 | xref[u["Name"]] = u["FlaskName"] 88 | for k, v in param_dict.items(): 89 | try: 90 | app.config[xref[k]] = v 91 | except Exception as e: 92 | logging.error(f"Could not map AWS SSM parameter {k}: {e}") 93 | 94 | # Override with environment variables with FLASK_ prefix 95 | app.config.from_prefixed_env() 96 | 97 | # Extract database credentials from Secrets Manager 98 | if "DBCREDENTIALS" in app.config: 99 | # The credentials secret may arrive via SSM or the environment, so read it from app.config 100 | app.config["DATABASE_URL"] = aws.aws_secret_to_psql_url( 101 | app.config["DBCREDENTIALS"], "us-east-1", "webstats" 102 | ) 103 | 104 | 105 | app.config["SQLALCHEMY_DATABASE_URI"] = app.config["DATABASE_URL"] 106 | # TODO SECRET_KEY from parameter store 107 | app.config[ 108 | "SECRET_KEY" 109 | ] = "1849cb85026145adc5164b9568d6afbde65351264f87c25aebdadc576ae662f5" 110 | 111 | configure_logger(app) 112 | register_extensions(app) 113 | register_blueprints(app) 114 | register_errorhandlers(app) 115 | register_shellcontext(app) 116 | register_commands(app) 117 | return app 118 | 119 | 120 | def register_extensions(app): 121 | """Register Flask extensions.""" 122 | cache.init_app(app) 123 | db.init_app(app) 124 | csrf_protect.init_app(app) 125 | 126 | # No debug toolbar for production 127 | if app.config["ENV"] != "production": 128 | debug_toolbar.init_app(app) 129 | 130 | migrate.init_app(app, db) 131 | flask_static_digest.init_app(app) 132 | return None 133 | 134 | 135 | def register_blueprints(app): 136 | """Register Flask blueprints.""" 137 | app.register_blueprint(stats.bp) 138 | # Exclude debugging tools if this is a production environment 139 | if app.config["ENV"] != "production": 140 | app.register_blueprint(splash.blueprint) 141 | return None 142 | 143 | 144 | def register_errorhandlers(app): 145 | """Register error handlers.""" 146 | 147 | def render_error(error): 148 | """Render error template.""" 149 | # If a HTTPException, pull the `code` attribute; default to 500 150 | error_code = getattr(error, "code", 500) 151 | return render_template(f"{error_code}.html"), error_code 152 | 153 | # Pass through http error codes if this is production 154 | if (app.config['ENV'] != 'production'): 155 | for errcode in [401, 404, 500]: 156 | app.errorhandler(errcode)(render_error) 157 | 158 | return None 159 | 160 | 161 | def register_shellcontext(app): 162 | """Register shell context objects.""" 163 | 164 | def shell_context(): 165 | """Shell context objects.""" 166 | return {"db": db} 167 | 168 | app.shell_context_processor(shell_context) 169 | 170 | 171 | def register_commands(app): 172 | """Register Click commands.""" 173 | 
app.cli.add_command(commands.test) 174 | app.cli.add_command(commands.lint) 175 | app.cli.add_command(commands.gendb) 176 | app.cli.add_command(commands.ingest) 177 | app.cli.add_command(commands.configp) 178 | app.cli.add_command(commands.packages) 179 | 180 | 181 | def configure_logger(app): 182 | """Configure loggers.""" 183 | 184 | logger = logging.getLogger("webstats") 185 | logger.setLevel(app.config["LOG_LEVEL"]) 186 | log_file = app.config["LOG_FILEPATH"] 187 | 188 | # Create the log file if necessary 189 | dir_path = os.path.dirname(log_file) 190 | if not os.path.exists(dir_path): 191 | os.makedirs(dir_path) 192 | # Create the file if it does not exist 193 | with open(log_file, 'a'): 194 | os.utime(log_file, None) 195 | 196 | # TODO Why do we need a logrotate.d entry if I have the code below 197 | file_handler = logging.handlers.RotatingFileHandler( 198 | log_file, maxBytes=100000000, backupCount=5 199 | ) 200 | file_handler.setLevel(app.config["LOG_LEVEL"]) 201 | formatter = logging.Formatter( 202 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 203 | ) 204 | file_handler.setFormatter(formatter) 205 | logger.addHandler(file_handler) 206 | 207 | app.logger.handlers = logger.handlers 208 | app.logger.setLevel(logger.level) 209 | -------------------------------------------------------------------------------- /docs/stats_replacement_design_brief.md: -------------------------------------------------------------------------------- 1 | # Design Brief: stats.bioconductor.org replacement 2 | 3 | Author: Robert Shear rshear@ds.dfci.harvard.edu 4 | Date: 2023-10-04 5 | Version: rev 2 6 | 7 | ## Change log 8 | ### rev 2 9 | - 04-Oct-2023 10 | - The "Current State: Static Web Assets" section spoke of 9 content types. There are 8 content types. The `_pkg_scores.tab` example [https://bioconductor.org/packages/stats/bioc/bioc_pkg_scores.tab](https://bioconductor.org/packages/stats/bioc/bioc_pkg_scores.tab) appeared twice in the example table. 11 | 12 | ### rev 1 13 | - 03-Oct-2023 14 | - Added coding language information (markdown+html+javascript) to section [[#Server www.bioconductor.org]] 15 | - Resolved Future State S3 design issues 16 | - Current state description of paths to content with links to examples added 17 | - Publishing to www elaborated 18 | - Future state architecture now simply Python data-aware pages 19 | - Future state Glue issues resolved 20 | - Additional Future State considerations will be determined during development 21 | 22 | ## Initial draft 23 | - 21-Sep-2023 24 | 25 | # Overview 26 | 27 | ![Block Diagram](design_brief_block_diagram.png) 28 | # Current State 29 | ## Repo 30 | - https://github.com/Bioconductor/download_stats 31 | ## Service: S3 Bucket 32 | - The only S3 bucket that is currently active with respect to the stats process is 33 | ``` 34 | s3://bioc-cloudfront-logs 35 | ``` 36 | 37 | - Each object is a gz-compressed W3C Extended Web Log file. The recent files are generated by the www CloudFront distribution. The provenance of the early files, which go back as far as 2013, is unknown, but they are assumed to have the same format as the current files. 38 | - There is no prefix to the objects, i.e., all objects are at the top level with names as seen in the specimen S3 object name below. The code `E1...V3` designates the log source. The `0129f869` is a hash to assure object uniqueness. 39 | ``` 40 | s3://bioc-cloudfront-logs/E1TVLJONPTUXV3.2023-09-17-00.0129f869.gz 41 | ``` 42 | - Objects are destroyed when 6 months old. 
They are all assigned to the Standard storage class. The object count is ~61k and the bucket size is ~33 GB. 44 | ## Server: stats.bioconductor.org 45 | 46 | ### Process: Log Intake 47 | - Copy logs verbatim from S3 bucket to EBS volume 48 | - Python script: `get_s3_logs.py`, repo `Bioconductor/download_stats` 49 | - Runs weekly Sunday 20:00 local time 50 | 51 | ### Data: W3C Web Logs 52 | - Space used: 359 GB 53 | - Location `/home/biocadmin/bioc-access-logs/s3/YYYY-MM-DD/E1TVLJONPTUXV3.YYYY-MM-DD-HH.0129f869.gz` 54 | - Date range: 2013-08-28 - 2023-09-18 55 | - Specimen file name: `E1TVLJONPTUXV3.2023-09-17-00.0129f869.gz` 56 | ### Process: Database Load 57 | - Transforms the raw log files into rows in a SQLite database 58 | - One log line per row, with various column-level transformations 59 | - Python script: `get_s3_logs.py`, repo `Bioconductor/download_stats` 60 | - Runs weekly Sunday 22:00 local time 61 | ### Data: SQLite DBs 62 | - Space used: 67 GB 63 | - Directory: `/home/biocadmin/download_dbs/` 64 | - One file for each year: format `download_db_YYYY.sqlite`, e.g. `download_db_2023.sqlite` 65 | - Range of files: 2009 through 2023 66 | ### Process: Generate Static Web Assets 67 | - Creates static HTML assets that are surfaced by the www.bioconductor.org Apache2 server 68 | - Produced for one topic at a time, weekly. For each topic, there are two scripts, `extractDownloadStats-for-xxx.py` and `makeDownloadStatsHTML-for-xxx.py`, where `xxx` is given in the table below. 69 | 70 | | Topic | When | Code | 71 | | ------------------- | --------------- | --------------- | 72 | | software packages | Monday 12:00 | bioc | 73 | | annotation packages | Wednesday 15:00 | data-annotation | 74 | | experiment packages | Friday 03:00 | data-experiment | 75 | | workflows packages | Saturday 15:00 | workflows | 76 | 77 | 78 | ### Data: Static Web Assets (.html, .txt, .tab) 79 | On stats.bioconductor.org, the web content is stored under directory `/home/biocadmin/public_html/stats/`. When published to master.bioconductor.org, it is stored under `/extra/www/bioc/packages/stats`. 80 | 81 | 82 | The files are organized hierarchically with the keys as shown below. 83 | 84 | | Level | Keys | Example | 85 | | ----- | --------------------------------------- | -------------------------------- | 86 | | 0 | | Aggregate measures | 87 | | 1 | `package_type` | `bioc` | 88 | | 2 | `package_type`, `package_name` | `bioc, S4Vectors` | 89 | | 3 | `package_type`, `package_name`, `year` | `bioc, S4Vectors, 2023` | 90 | 91 | The legal values for `package_type` are `bioc`, `data-annotation`, `data-experiment`, and `workflows`. 92 | 93 | There are 8 content types, 3 of which are HTML documents, the remainder being text files. Specimens of these files can be retrieved by following the links in the table below. 
94 | 95 | | Path | Content type | 96 | |:-------------------------------------------------------------------------------------------------------------------------- |:------------------------ | 97 | | [ROOT](https://bioconductor.org/packages/stats/) | index.html | 98 | | [bioc/](https://bioconductor.org/packages/stats/bioc/) | index.html | 99 | | [bioc/bioc_packages.txt](https://bioconductor.org/packages/stats/bioc/bioc_packages.txt) | bioc_packages.txt | 100 | | [bioc/bioc_pkg_scores.tab](https://bioconductor.org/packages/stats/bioc/bioc_pkg_scores.tab) | bioc_pkg_scores.tab | 101 | | [bioc/bioc_pkg_stats.tab](https://bioconductor.org/packages/stats/bioc/bioc_pkg_stats.tab) | bioc_pkg_stats.tab | 102 | | [bioc/S4Vectors/](https://bioconductor.org/packages/stats/bioc/S4Vectors/) | index.html | 103 | | [bioc/S4Vectors/S4Vectors_2023_stats.tab](https://bioconductor.org/packages/stats/bioc/S4Vectors/S4Vectors_2023_stats.tab) | S4Vectors_2023_stats.tab | 104 | | [bioc/S4Vectors/S4Vectors_stats.tab](https://bioconductor.org/packages/stats/bioc/S4Vectors/S4Vectors_stats.tab) | S4Vectors_stats.tab | 105 | 106 | ## Server: www.bioconductor.org 107 | - Apache2 Server 108 | # Future State 109 | 110 | ## Service: S3 Bucket 111 | 112 | ### Recovering history 113 | The object retention time in the current bucket is 183 days. However, the EBS files in the `/home/biocadmin/bioc-access-logs/s3` directory go back to 2013. These files should be returned to the S3 bucket. S3 storage offers archival durability, while EBS storage does not. In addition, S3 storage is less expensive than EBS storage. Furthermore, there are less expensive S3 storage classes that are appropriate for this use case. 114 | ### Data privacy 115 | The source IP addresses in the weblog records are subject to privacy requirements in GDPR jurisdictions as well as elsewhere. We should encrypt the source IP addresses with a strong, secret asymmetric key. This will allow us to continue to count unique IP addresses, while securing the identity of the user. 116 | ### Partition Projection 117 | When an S3 bucket is the source for an Athena query, the default strategy is to scan the entire bucket (or prefix). This is barely satisfactory in the current state, with 6 months of history. Once the log history has been returned to S3, this would make date-range-limited inquiries intractable. 118 | 119 | Athena has the capability of extracting information from the object path and mapping it to database columns. This is known as a partition projection. Specifically, we propose to add a path prefix composed of the CloudFront distribution ID (for other weblog sources, an alternative unique identifier) followed by the date. So, the file `E1TVLJONPTUXV3.2021-12-26-HH.0129f869.gz` will have an S3 URI like this: 120 | 121 | ``` 122 | s3://bucket_name/E1TVLJONPTUXV3/2021-12-26/E1TVLJONPTUXV3.2021-12-26-HH.0129f869.gz 123 | ``` 124 | 125 | More information may be found at [Setting up partition projection](https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html). Details on mapping the URI structure to columns may be found in the document [Specifying custom S3 storage locations](https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html#partition-projection-specifying-custom-s3-storage-locations). 126 | 127 | #### Implementation 128 | 129 | Before commencing the data migration, the performance characteristics of this partition projection should be verified, as sketched below. 
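
A minimal sketch of what that verification table might look like, assuming the prefix scheme above. Everything here is illustrative: `bucket_name` is a placeholder, and the column list is abbreviated (a production table must declare every CloudFront log field, in order).

```
-- Illustrative Athena DDL; bucket_name and the column list are placeholders.
CREATE EXTERNAL TABLE cloudfront_logs_projected (
  `date` DATE,
  `time` STRING,
  location STRING,
  bytes BIGINT,
  request_ip STRING,
  uri STRING,
  status INT
)
PARTITIONED BY (distribution STRING, log_date STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION 's3://bucket_name/'
TBLPROPERTIES (
  'skip.header.line.count' = '2',
  'projection.enabled' = 'true',
  'projection.distribution.type' = 'enum',
  'projection.distribution.values' = 'E1TVLJONPTUXV3',
  'projection.log_date.type' = 'date',
  'projection.log_date.range' = '2013-08-28,NOW',
  'projection.log_date.format' = 'yyyy-MM-dd',
  'storage.location.template' = 's3://bucket_name/${distribution}/${log_date}/'
);
```

With such a table, a predicate like `WHERE log_date BETWEEN '2023-01-01' AND '2023-01-31'` should scan only the 31 matching prefixes rather than the full bucket; confirming that behavior is the point of the verification.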
130 | 131 | The current CloudFront-to-S3 process will require modification to attach the appropriate prefix to each object. 132 | ### Data Migration 133 | 134 | A new bucket will be established for the weblogs, using the new production account number. The log stream with the new format should be activated and a final synchronization with the old S3 bucket effected. 135 | Due to the file structure of the current-state log files, the logs on the current-state `stats` server can be migrated with an `rsync` command. 136 | ## Service: ETL 137 | ### AWS Glue 138 | The AWS Glue service provides ETL (extract-transform-load) capabilities. Loading transformed weblogs into a relational database is a central use case for Glue. 139 | 140 | Athena is an AWS service based on the Trino (formerly PrestoSQL) open-source distributed SQL query engine. It mediates the consumption of the weblogs and the loading of transformed results into other database systems. The data transformation rules in the Current State Python ETL code will be converted to SQL. 141 | 142 | Note. In 2021, Martin Morgan developed an Athena-based system for analyzing our weblogs directly from S3. The source code can be found at https://github.com/Bioconductor/AthenaLogs. 143 | 144 | ## Service: Relational Database 145 | - AWS Aurora PostgreSQL 146 | ## Service: Data Gateway 147 | - The data gateway will be a Python function run within the AWS Lambda service that will consume HTTP GET URLs and return the appropriate text or HTML document. The Python function will be a composition of the Current State Python code. 148 | ## Server: www.bioconductor.org 149 | All HTTP requests to `https://bioconductor.org/packages/stats/` will be routed to a new Python application that will decode the URI path and produce a bit-for-bit perfect replica of the current output. 150 | 151 | This program will tentatively use the `Flask` framework. 152 | # Additional Considerations 153 | 154 | The following topics will be determined during development. 155 | - Roles 156 | - Tags 157 | - Test Instance 158 | - System Management 159 | - Terraform 160 | - Ansible 161 | - CloudWatch Alerts and Metrics 162 | -------------------------------------------------------------------------------- /bioc_webstats/stats.py: -------------------------------------------------------------------------------- 1 | """Routes for the package download statistics pages. 2 | 3 | This blueprint decodes the legacy /packages/stats/ URI paths and returns 4 | HTML pages or tab-separated text files that replicate the legacy output. 5 | 6 | Raises: 7 | HTTPException: 404 via abort() for unknown categories or packages. 8 | """ 9 | import os 10 | from collections import defaultdict 11 | from datetime import date, timedelta 12 | 13 | from flask import ( 14 | Blueprint, 15 | Response, 16 | abort, 17 | make_response, 18 | render_template, 19 | send_from_directory, 20 | ) 21 | 22 | # TODO mixed pattern! should either import all the models (e.g. Packages) or make everything qualified e.g. db.WebstatsInfo... 
23 | import bioc_webstats.models as db 24 | from bioc_webstats.models import Packages, PackageType, WebstatsInfo 25 | 26 | URI_PATH_PREFIX = "/packages/stats" 27 | 28 | # Map from incoming page name to PackageType 29 | category_map = { 30 | "bioc": { 31 | "category": PackageType.BIOC, 32 | "description": "software", 33 | "index_page": "index.html", 34 | "stem": "bioc", 35 | "tab_page_prefix": "bioc", 36 | "top": 75, 37 | }, 38 | "data-annotation": { 39 | "category": PackageType.ANNOTATION, 40 | "description": "annotation", 41 | "index_page": "data-annotation.html", 42 | "stem": "data-annotation", 43 | "tab_page_prefix": "annotation", 44 | "top": 30, 45 | }, 46 | "data-experiment": { 47 | "category": PackageType.EXPERIMENT, 48 | "description": "experiment", 49 | "index_page": "data-experiment.html", 50 | "stem": "data-experiment", 51 | "tab_page_prefix": "experiment", 52 | "top": 15, 53 | }, 54 | "workflows": { 55 | "category": PackageType.WORKFLOWS, 56 | "description": "workflow", 57 | "index_page": "workflows.html", 58 | "stem": "workflows", 59 | "tab_page_prefix": "workflows", 60 | "top": 0, 61 | }, 62 | } 63 | 64 | bp = Blueprint("stats", __name__, url_prefix=URI_PATH_PREFIX) 65 | 66 | def webstats_response(payload, content_type='text/html') -> Response: 67 | """Create an http Response including response headers required by consuming systems. 68 | 69 | Specifically, the Last-Modified header is set to midnight GMT 70 | immediately following the "valid through" date. 71 | 72 | Arguments: 73 | payload -- The body of the http response, either raw text or html 74 | 75 | Keyword Arguments: 76 | content_type -- The value of the Content-Type header (default: {'text/html'}) 77 | 78 | Returns: 79 | A Response variable, ready to be returned from a route or blueprint decorated function 80 | """ 81 | response = make_response(payload) 82 | response.headers['Content-Type'] = content_type 83 | generated_date = WebstatsInfo.get_valid_thru_date() 84 | modified_date = generated_date + timedelta(days=1) 85 | response.headers['Last-Modified'] = modified_date.strftime("%a, %d %b %Y %H:%M:%S GMT") 86 | return response 87 | 88 | def split_to_dict_list(lst): 89 | """Transform a list into a dictionary keyed on first letter (case insensitive).""" 90 | result = defaultdict(list) 91 | 92 | for item in sorted(lst, key=lambda x: x[0].lower()): 93 | first_char = item[0][0].upper() # Extract the first character of the string 94 | result[first_char].append(item) 95 | 96 | return result 97 | 98 | 99 | def result_list_to_visual_list(rows): 100 | """Transform 3-column database results to 4-column visual results with dense months.""" 101 | dates = set([u[0] for u in rows]) 102 | y0 = min(dates).year 103 | y1 = max(dates).year 104 | holes = ( 105 | set([date(y, m + 1, 1) for y in range(y0, y1 + 1) for m in range(12)]) - dates 106 | ) 107 | out = sorted(rows + [(w, 0, 0) for w in holes], key=lambda x: (-x[0].year, x[0])) 108 | return [ 109 | {'year': dt.year, 'month': dt.strftime("%b") if dt.day == 1 else "all", 'unique_ips': ip, 'downloads': dl} 110 | for dt, ip, dl in out 111 | ] 112 | 113 | 114 | def query_result_to_text(source): 115 | """Transform tabular query results to a string. 116 | 117 | The strings are exact replicas of the *_stats.tab and *_pkg_scores.tab 118 | files found under 119 | www.bioconductor.org/packages/stats/. 120 | 121 | They must match exactly because they may be consumed by external software. 
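
    Example (illustrative values; the header line is the literal heading emitted below):

        Year	Month	Nb_of_distinct_IPs	Nb_of_downloads
        2023	Jan	205	942
        2023	all	1800	11034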
122 | 123 | Arguments: 124 | source -- A list of tuples in the form 125 | [(package, date, ip_count, download_count)] 126 | or 127 | [(date, ip_count, download_count)] 128 | 129 | Returns: 130 | A string in the format of a tab-separated file with one header row. 131 | 132 | """ 133 | 134 | def process_one_package(package, rows): 135 | """For one package produce the result. If package is None, return 4 columns.""" 136 | if package is None: 137 | k = "" 138 | else: 139 | k = package + "\t" 140 | 141 | out = result_list_to_visual_list(rows) 142 | return "\n".join( 143 | [f"{k}{u['year']}\t{u['month']}\t{u['unique_ips']}\t{u['downloads']}" for u in out]) 144 | 145 | if source == []: 146 | return "" 147 | heading = "Year\tMonth\tNb_of_distinct_IPs\tNb_of_downloads" 148 | match len(source[0]): 149 | case 3: 150 | return heading + "\n" + (process_one_package(None, source)) 151 | 152 | case 4: 153 | result = ["Package\t" + heading] 154 | split = {} 155 | for t in source: 156 | split.setdefault(t[0], []).append(t[1:]) 157 | 158 | for k, v in split.items(): 159 | result.append(process_one_package(k, v)) 160 | 161 | return "\n".join(result) 162 | 163 | case _: 164 | raise AssertionError("query_result_to_text expects 3 or 4 columns") 165 | 166 | @bp.route('/static/<path:filename>') 167 | def static_files(filename): 168 | """Redirect requests for static files to the actual static root of webstats.""" 169 | static_folder = os.path.join(os.path.dirname(__file__), 'static') 170 | return send_from_directory(static_folder, filename) 171 | 172 | 173 | @bp.route("/bioc/bioc_packages.txt") 174 | def show_packages(): 175 | """Return the list of package names as a plain-text file.""" 176 | payload = db.Packages.get_package_names() 177 | text = ("\n").join(payload) 178 | return webstats_response(text, content_type="text/plain") 179 | 180 | 181 | @bp.route("/<category>/<package>_pkg_scores.tab") 182 | def show_package_scores(category, package): 183 | """Return the download scores for a category as a tab-separated file.""" 184 | # We match the legacy system, where both the path and the file name include the 185 | # category, in forms like /bioc/bioc_pkg_scores.tab or 186 | # /data-annotation/annotation_pkg_scores.tab. 187 | # `category_map` maps each category name to its attributes: package type, 188 | # description, index page, stem, tab page prefix, and top count. It is used 189 | # throughout this module to retrieve the relevant information based on the 190 | # category name. 191 | 192 | 
selected_category = category_map.get(category, None) 193 | if selected_category is not None and package == selected_category["tab_page_prefix"]: 194 | payload = db.Stats.get_download_scores(category=selected_category["category"]) 195 | else: 196 | abort(404) 197 | text = "Package\tDownload_score\n" + "\n".join([f"{x[0]}\t{x[1]}" for x in payload]) 198 | return webstats_response(text, content_type="text/plain") 199 | 200 | 201 | @bp.route("/<category>/<package>_stats.tab") 202 | @bp.route("/<category>/<package>_<year>_stats.tab") 203 | @bp.route("/<category>/<package_path>/<package>_stats.tab") 204 | @bp.route("/<category>/<package_path>/<package>_<year>_stats.tab") 205 | def show_package_stats(category, package, package_path=None, year=None): 206 | """Return download statistics as a tab-separated (.tab) file.""" 207 | selected_category = category_map.get(category, None) 208 | if selected_category is None: 209 | abort(404) 210 | # If there is a second level in the path, then it can only be the package name, 211 | # and that name must match the package name at the leaf. 212 | # If the url is for all the packages in the repo, 213 | # it will be in the form /bioc/bioc_pkg_stats.tab and the year parameter will be 'pkg' 214 | 215 | # Helpful to keep the nested conditionals reasonably simple 216 | payload = None 217 | 218 | if package_path is None and selected_category["stem"] == category: 219 | # Here the package is actually the name of the category 220 | package = None 221 | if year == 'pkg': 222 | # Here are /bioc/bioc_pkg_stats.tab and /data-experiment/experiment_pkg_stats.tab 223 | # In this case we will report all packages in the category 224 | year = None 225 | else: 226 | # Here we report combined data, either for a year or for all years 227 | payload = db.Categorystats.get_combined_counts(selected_category["category"], year) 228 | 229 | if payload is None: 230 | payload = db.Stats.get_download_counts(selected_category["category"], package, year) 231 | 232 | if payload == []: 233 | abort(404) 234 | 235 | return webstats_response(query_result_to_text(payload), content_type="text/plain") 236 | 237 | 238 | @bp.route("/") 239 | @bp.route("/index.html") 240 | @bp.route("/<category>.html") 241 | def show_package_summary(category="bioc"): 242 | """Render package summary page.""" 243 | selected_category = category_map.get(category, None) 244 | if selected_category is None: 245 | abort(404) 246 | category_enum = selected_category["category"] 247 | scores = db.Stats.get_download_scores(category_enum) 248 | url_list = [ 249 | [u["index_page"], u["description"]] 250 | for u in category_map.values() 251 | if selected_category["category"] != u["category"] 252 | ] 253 | top_count = selected_category["top"] 254 | top = sorted(scores, key=lambda x: x[-1])[:top_count] 255 | 256 | result = render_template( 257 | "category.html", 258 | top_count=top_count, 259 | category_links=url_list, 260 | category=category_enum, 261 | category_name=selected_category["description"], 262 | category_url_stem=selected_category["stem"], 263 | tab_page_prefix=selected_category["tab_page_prefix"], 264 | generated_date=WebstatsInfo.get_valid_thru_date(), 265 | top=top, 266 | scores=split_to_dict_list(scores), 267 | ) 268 | return webstats_response(result) 269 | 270 | 271 | @bp.route("/<category>") 272 | @bp.route("/<category>/") 273 | @bp.route("/<category>/<package>/") 274 | @bp.route("/<category>/index.html") 275 | @bp.route("/<category>/<package>/index.html") 276 | def show_package_details(category, package=None): 277 | """Display package details.""" 278 | selected_category = category_map.get(category, None) 279 | if selected_category is None: 280 | abort(404) 281 | 282 | if package is None: 283 | source = db.Categorystats.get_combined_counts(selected_category["category"]) 284 | 
depver = None 285 | else: 286 | source = db.Stats.get_download_counts(selected_category["category"], package) 287 | if len(source) == 0: 288 | abort(404) 289 | package_info = db.Packages.get_package_details(package) 290 | depver = package_info[3] 291 | 292 | if depver is not None: 293 | depver = depver[0] + "." + str(int(depver[1:3]))  # e.g. "318" -> "3.18" 294 | 295 | split = {} 296 | for t in source: 297 | split.setdefault(t[0].year, []).append(t) 298 | 299 | data_by_year = {year: result_list_to_visual_list(data) for year, data in split.items()} 300 | generated_date = WebstatsInfo.get_valid_thru_date() 301 | 302 | result = render_template( 303 | "stats-bioc.html", 304 | category=category, 305 | category_name=selected_category["description"], 306 | category_stem=selected_category["stem"], 307 | category_index_page=('/').join((bp.url_prefix, selected_category["index_page"])), 308 | package=package, 309 | generated_date=generated_date, 310 | data_by_year=data_by_year, 311 | first_year=list(data_by_year.keys())[-1], 312 | last_year=list(data_by_year.keys())[0], 313 | deprecated_version=depver 314 | ) 315 | return webstats_response(result) 316 | 317 | --------------------------------------------------------------------------------