├── assets
│   ├── img
│   │   ├── .gitkeep
│   │   └── favicon.ico
│   ├── js
│   │   ├── script.js
│   │   ├── plugins.js
│   │   └── main.js
│   └── css
│       └── style.css
├── bioc_webstats
│   ├── webpack
│   │   └── .gitkeep
│   ├── __init__.py
│   ├── templates
│   │   ├── nav.html
│   │   ├── footer.html
│   │   ├── 500.html
│   │   ├── 404.html
│   │   ├── 401.html
│   │   ├── about.html
│   │   ├── main.css
│   │   ├── home.html
│   │   ├── layout.html
│   │   ├── category.html
│   │   └── stats-bioc.html
│   ├── flask_ingest.sh
│   ├── flask_ingest_crontab_setup.sh
│   ├── app_waitress.py
│   ├── extensions.py
│   ├── static
│   │   ├── cache_manifest.json
│   │   └── barchart.js
│   ├── splash.py
│   ├── database.py
│   ├── ingest_logs.py
│   ├── configmodule.py
│   ├── packages_table_update.py
│   ├── aws_functions.py
│   ├── commands.py
│   ├── app.py
│   └── stats.py
├── tests
│   ├── __init__.py
│   ├── .env
│   ├── factories.py
│   ├── test_models.py
│   └── conftest.py
├── migrations
│   ├── README
│   ├── script.py.mako
│   ├── alembic.ini
│   ├── versions
│   │   └── 9c266b1a4aa9_.py
│   └── env.py
├── .coveragerc
├── e2e
│   ├── constants.ts
│   ├── .env
│   ├── workflow_page.spec.ts
│   └── smoke_test.spec.ts
├── test-deployment
│   ├── ansible
│   │   ├── inventory.ini
│   │   ├── ansible.cfg
│   │   └── install-packages.yml
│   ├── terraform.tfstate.backup
│   ├── terra-development
│   │   ├── main.tf
│   │   └── variables.tf
│   ├── terra-production
│   │   ├── main.tf
│   │   └── variables.tf
│   └── main.tf
├── etl
│   ├── i_populate_web_downloads.sql
│   ├── t_bio _webstats_info.sql
│   ├── postgresql
│   │   ├── u_webstats_info_psql.sql
│   │   ├── t_webstats_info_psql.sql
│   │   ├── t_categorystats_postgres.sql
│   │   ├── t_stats_postgres.sql
│   │   ├── t_packages_postgres.sql
│   │   ├── update_stats.sql
│   │   ├── v_categorystats_postgres.sql
│   │   ├── v_stats_psqlsql.sql
│   │   ├── t_bioc_web_downloads_psql.sql
│   │   ├── f_stats.sql
│   │   └── sp_update_stats.sql
│   ├── change-table-location.json
│   ├── cli_specimens.sh
│   ├── t_stats_parquet.sql
│   ├── t_stats_tsv.sql
│   ├── t_bioc_web_downloads.sql
│   ├── i_populate_category_stats.sql
│   ├── legacy-access_log-to-web_downloads.sql
│   ├── bioc-www-logreader-prod.json
│   ├── legacy-table-def.sql
│   ├── retrieve_package_info.py
│   ├── t_cloudfront_logs.sql
│   ├── s3_move_objects.sh
│   ├── v_bioc_web_downloads.sql
│   ├── t_bioc_web_logs_partitioned.sql
│   └── log_partition_projector.py
├── docs
│   ├── webstats-erd-0_1_9.png
│   ├── design_brief_block_diagram.png
│   ├── bioc-webstats-architecture-v2.excalidraw.png
│   ├── test-instance-setup.md
│   ├── old_specimen.txt
│   ├── webstats-system-overview.md
│   └── stats_replacement_design_brief.md
├── conversion
│   ├── README.md
│   ├── isql_row_counts.sh
│   ├── global_package_history.R
│   ├── global_package_history.py
│   └── access_logs_to_stats_tsv.py
├── dist
│   ├── bioc_webstats-0.1.10-py3-none-any.whl
│   ├── bioc_webstats-0.1.8-py3-none-any.whl
│   └── bioc_webstats-0.1.9-py3-none-any.whl
├── autoapp.py
├── installer_scripts
│   ├── build_docker.sh
│   ├── logrotate.d
│   │   ├── bioc-webstats
│   │   └── README.txt
│   ├── entrypoint.sh
│   ├── bioc-webstats.service
│   ├── apache2.service
│   ├── flask_environment
│   ├── run_docker.sh
│   ├── aws_installer.sh
│   ├── create_ec2_instance.sh
│   ├── Dockerfile
│   ├── docker_dev_setup_example.md
│   └── installer.sh
├── .env.example
├── test-instance-config.py
├── .github
│   └── workflows
│       └── playwright.yml
├── .gitignore
├── pyproject.toml
├── playwright.config.ts
├── package.json
├── webpack.config.js
└── README.md

/assets/img/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bioc_webstats/webpack/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
-------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /assets/js/script.js: -------------------------------------------------------------------------------- 1 | // App initialization code goes here 2 | -------------------------------------------------------------------------------- /migrations/README: -------------------------------------------------------------------------------- 1 | Single-database configuration for Flask. 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = bioc_webstats 3 | omit = 4 | tests/* 5 | 6 | -------------------------------------------------------------------------------- /bioc_webstats/__init__.py: -------------------------------------------------------------------------------- 1 | """Main application package.""" 2 | __version__ = "0.1.8" 3 | -------------------------------------------------------------------------------- /e2e/constants.ts: -------------------------------------------------------------------------------- 1 | // constants.ts 2 | export const URL_STEM = '/packages/stats/'; 3 | -------------------------------------------------------------------------------- /e2e/.env: -------------------------------------------------------------------------------- 1 | # WEBSTATS_URL='http://localhost:5000' 2 | # WEBSTATS_URL='http://3.217.171.126:8000/' -------------------------------------------------------------------------------- /test-deployment/ansible/inventory.ini: -------------------------------------------------------------------------------- 1 | [ec2] 2 | ec2-3-142-133-254.us-east-2.compute.amazonaws.com -------------------------------------------------------------------------------- /test-deployment/ansible/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | inventory = inventory.ini 3 | remote_user = ubuntu -------------------------------------------------------------------------------- /assets/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/assets/img/favicon.ico -------------------------------------------------------------------------------- /etl/i_populate_web_downloads.sql: -------------------------------------------------------------------------------- 1 | insert into bioc_web_downloads 2 | select * 3 | from v_bioc_web_downloads; -------------------------------------------------------------------------------- /assets/js/plugins.js: -------------------------------------------------------------------------------- 1 | // place any jQuery/helper plugins in here, instead of separate, slower script files. 
2 |
--------------------------------------------------------------------------------
/docs/webstats-erd-0_1_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/docs/webstats-erd-0_1_9.png
--------------------------------------------------------------------------------
/docs/design_brief_block_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/docs/design_brief_block_diagram.png
--------------------------------------------------------------------------------
/etl/t_bio _webstats_info.sql:
--------------------------------------------------------------------------------
1 | -- TODO operationalize
2 | CREATE TABLE webstats_info
3 | ("key" varchar(23) PRIMARY KEY,
4 | "value" varchar(128));
--------------------------------------------------------------------------------
/conversion/README.md:
--------------------------------------------------------------------------------
1 | This folder contains artifacts for the conversion from the legacy stats.bioconductor.org server to the bioc-webstats system.
2 |
--------------------------------------------------------------------------------
/dist/bioc_webstats-0.1.10-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/dist/bioc_webstats-0.1.10-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/bioc_webstats-0.1.8-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/dist/bioc_webstats-0.1.8-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/bioc_webstats-0.1.9-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/dist/bioc_webstats-0.1.9-py3-none-any.whl
--------------------------------------------------------------------------------
/autoapp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Create an application instance."""
3 | from bioc_webstats.app import create_app
4 | app = create_app("development")
--------------------------------------------------------------------------------
/docs/bioc-webstats-architecture-v2.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/bio-web-stats/devel/docs/bioc-webstats-architecture-v2.excalidraw.png
--------------------------------------------------------------------------------
/bioc_webstats/templates/nav.html:
--------------------------------------------------------------------------------
1 |
2 |
7 |
8 |
--------------------------------------------------------------------------------
/installer_scripts/build_docker.sh:
--------------------------------------------------------------------------------
1 | # To override execution user:group webstats:webstats, add --build-arg OSUSER="yankee" --build-arg OSGROUP="doddle"
2 | docker build -t webstats-server .
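# A hedged example of the override described in the comment above (the
# user/group names here are illustrative, not values used by this repo):
#   docker build -t webstats-server --build-arg OSUSER="deploy" --build-arg OSGROUP="deploy" .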
3 |
--------------------------------------------------------------------------------
/bioc_webstats/flask_ingest.sh:
--------------------------------------------------------------------------------
1 | # TODO what is the correct directory for this script?
2 | . .venv/bin/activate
3 | export FLASK_APP="bioc_webstats.app:create_app('production', '/bioc/webstats/prod')"
4 | flask ingest -c E1TVLJONPTUXV3
5 |
--------------------------------------------------------------------------------
/installer_scripts/logrotate.d/bioc-webstats:
--------------------------------------------------------------------------------
1 | /var/log/bioc-webstats/webstats.log {
2 |     weekly
3 |     rotate 4
4 |     compress
5 |     missingok
6 |     notifempty
7 |     create 0640 ubuntu ubuntu
8 |     delaycompress
9 | }
10 |
--------------------------------------------------------------------------------
/conversion/isql_row_counts.sh:
--------------------------------------------------------------------------------
1 | # report row counts for all sqlite3 databases
2 | for file in /mnt/data/home/biocadmin/download_dbs/download_db_*.sqlite; do
3 |   echo $(basename "$file") $(sqlite3 "$file" "select count(*) from access_log")
4 | done
5 |
--------------------------------------------------------------------------------
/etl/postgresql/u_webstats_info_psql.sql:
--------------------------------------------------------------------------------
1 | -- update "Valid through" date
2 |
3 | INSERT INTO webstats_info (key, value)
4 | VALUES ('ValidThru', (SELECT MAX(date) FROM bioc_web_downloads))
5 | ON CONFLICT (key)
6 | DO UPDATE SET value = EXCLUDED.value;
7 |
--------------------------------------------------------------------------------
/test-deployment/terraform.tfstate.backup:
--------------------------------------------------------------------------------
1 | {
2 |   "version": 4,
3 |   "terraform_version": "1.5.5",
4 |   "serial": 5,
5 |   "lineage": "ed68b15c-df94-c839-bf44-3a56b77689b5",
6 |   "outputs": {},
7 |   "resources": [],
8 |   "check_results": null
9 | }
10 |
--------------------------------------------------------------------------------
/etl/change-table-location.json:
--------------------------------------------------------------------------------
1 | {
2 |   "DatabaseName": "glue-sup-db",
3 |   "TableInput": {
4 |     "Name": "bw-dev_bioc_weblogs_small_test",
5 |     "StorageDescriptor": {
6 |       "Location": "s3://dev-bioc-weblogs-small-test/weblogs/"
7 |     }
8 |   }
9 | }
10 |
--------------------------------------------------------------------------------
/bioc_webstats/templates/footer.html:
--------------------------------------------------------------------------------
1 |
13 |
--------------------------------------------------------------------------------
/installer_scripts/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Start systemd as the init system. exec replaces this shell, so it must be
5 | # the last statement of the script; services such as ssh should be enabled
6 | # as systemd units so that systemd starts them itself.
7 | exec /lib/systemd/systemd
--------------------------------------------------------------------------------
/tests/.env:
--------------------------------------------------------------------------------
1 | # Environment variable overrides for local development
2 | FLASK_APP=autoapp.py
3 | FLASK_DEBUG=1
4 | FLASK_ENV=development
5 | DATABASE_URL=sqlite:///dev.db
6 | LOG_LEVEL=INFO
7 | SECRET_KEY=012983901238102381038012381298
8 | # In production, set to a higher number, like 31556926
9 | SEND_FILE_MAX_AGE_DEFAULT=0
10 |
--------------------------------------------------------------------------------
/test-deployment/terra-development/main.tf:
--------------------------------------------------------------------------------
1 | # development/main.tf
2 | provider "aws" {
3 |   region = var.aws_region
4 | }
5 |
6 | # ...
7 |
8 | resource "aws_instance" "example" {
9 |   ami           = var.ami_id
10 |   instance_type = var.instance_type
11 |
12 |   tags = {
13 |     Name = "development-web-server"
14 |   }
15 | }
--------------------------------------------------------------------------------
/test-deployment/terra-production/main.tf:
--------------------------------------------------------------------------------
1 | # production/main.tf
2 | provider "aws" {
3 |   region = var.aws_region
4 | }
5 |
6 | # ...
7 |
8 | resource "aws_instance" "example" {
9 |   ami           = var.ami_id
10 |   instance_type = var.instance_type
11 |
12 |   tags = {
13 |     Name = "production-web-server"
14 |   }
15 | }
--------------------------------------------------------------------------------
/bioc_webstats/flask_ingest_crontab_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # TODO correct directory?
4 |
5 | # Define the script path
6 | SCRIPT_PATH="flask_ingest.sh"
7 |
8 | # Ensure the script is executable
9 | chmod +x $SCRIPT_PATH
10 |
11 | # Add a new cron job
12 | (crontab -l 2>/dev/null; echo "12 1 * * * $SCRIPT_PATH") | crontab -
13 |
--------------------------------------------------------------------------------
/installer_scripts/bioc-webstats.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=Waitress service for www-webstats
3 | After=network.target
4 |
5 | [Service]
6 | WorkingDirectory=/home/ubuntu
7 | EnvironmentFile=/home/ubuntu/flask_environment
8 | ExecStart=/home/ubuntu/.venv/bin/python3 -m bioc_webstats.app_waitress
9 |
10 | [Install]
11 | WantedBy=multi-user.target
12 |
--------------------------------------------------------------------------------
/test-deployment/terra-development/variables.tf:
--------------------------------------------------------------------------------
1 | # development/variables.tf
2 | variable "aws_region" {
3 |   type    = string
4 |   default = "us-east-2"
5 | }
6 |
7 | variable "ami_id" {
8 |   type    = string
9 |   default = "ami-09d9029d9fc5e5238"
10 | }
11 |
12 | variable "instance_type" {
13 |   type    = string
14 |   default = "t2.micro"
15 | }
16 |
--------------------------------------------------------------------------------
/test-deployment/terra-production/variables.tf:
--------------------------------------------------------------------------------
1 | # production/variables.tf
2 | variable "aws_region" {
3 |   type    = string
4 |   default = "us-east-2"
5 | }
6 |
7 | variable "ami_id" {
8 |   type    = string
9 |   default = "ami-09d9029d9fc5e5238"
10 | }
11 |
12 | variable "instance_type" {
13 |   type    = string
14 |   default = "t2.micro"
15 | }
16 |
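# Hedged usage sketch (directory names are from this repo; terraform init and
# terraform apply are the standard commands, no repo-specific flags assumed):
#   cd test-deployment/terra-development && terraform init && terraform apply
#   cd test-deployment/terra-production  && terraform init && terraform apply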
--------------------------------------------------------------------------------
/installer_scripts/logrotate.d/README.txt:
--------------------------------------------------------------------------------
1 | Specimen log rotation configuration for the web app.
2 |
3 | Place the file installer_scripts/logrotate.d/bioc-webstats
4 | on the target system under /etc/logrotate.d/
5 |
6 | Then set the owner to root:
7 | sudo chown root:root bioc-webstats
8 |
9 | Test by running:
10 | sudo logrotate -d /etc/logrotate.d/bioc-webstats
--------------------------------------------------------------------------------
/bioc_webstats/app_waitress.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from waitress import serve
4 | from bioc_webstats import app
5 | import logging
6 | import sys
7 |
8 | logging.basicConfig(stream=sys.stderr)
9 |
10 | # TODO parameterize the port (currently hard-coded to 8000 below)
11 | if __name__ == "__main__":
12 |     serve(app.create_app('production', '/bioc/webstats/prod'), host='0.0.0.0', port=8000)
--------------------------------------------------------------------------------
/etl/cli_specimens.sh:
--------------------------------------------------------------------------------
1 | aws glue get-table --database-name glue-sup-db --name "bw-dev_bioc_weblogs_small_test"
2 |
3 | aws glue update-table --cli-input-json file://change-table-location.json
4 |
5 | aws glue create-table --database-name glue-sup-db --table-input file://etl/glue_weblog_table_in.json
6 |
7 | aws glue delete-table --database-name glue-sup-db --name bioc_web_logs
8 |
--------------------------------------------------------------------------------
/installer_scripts/apache2.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=The Apache HTTP Server
3 | After=network.target remote-fs.target nss-lookup.target
4 |
5 | [Service]
6 | ExecStart=/usr/sbin/apachectl -D FOREGROUND
7 | ExecReload=/usr/sbin/apachectl graceful
8 | ExecStop=/usr/sbin/apachectl stop
9 | Type=notify
10 | PrivateTmp=true
11 |
12 | [Install]
13 | WantedBy=multi-user.target
--------------------------------------------------------------------------------
/test-deployment/main.tf:
--------------------------------------------------------------------------------
1 | provider "aws" {
2 |   region = "us-east-2" # Change this to your desired AWS region
3 | }
4 |
5 | resource "aws_instance" "example" {
6 |   ami           = "ami-09d9029d9fc5e5238" # Change this to the desired AMI ID
7 |   instance_type = "t2.micro"              # Change this to the desired instance type
8 |
9 |   tags = {
10 |     Name = "test-bio-web-stats"
11 |   }
12 | }
--------------------------------------------------------------------------------
/installer_scripts/flask_environment:
--------------------------------------------------------------------------------
1 | export FLASK_APP="bioc_webstats.app:create_app('production', '/bioc/webstats/prod')"
2 | export FLASK_AWS_PATH_PARAMETER=/bioc/webstats/prod
3 | export FLASK_OSUSER="webstats"
4 | export FLASK_OSGROUP="webstats"
5 | export FLASK_APPROOT="/var/www/bioc-webstats"
6 | export FLASK_LOGROOT="/var/log/bioc-webstats"
--------------------------------------------------------------------------------
/bioc_webstats/templates/500.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 | {% block page_title %}Server error{% endblock %}
5 |
6 | {% block content %}
7 | <div>
8 |   <div>
9 |     <h1>500</h1>
10 |     <p>Sorry, something went wrong on our system. Don't panic, we are fixing it! Please try again later.</p>
11 |   </div>
12 | </div>
13 | {% endblock %} 14 | 15 | -------------------------------------------------------------------------------- /bioc_webstats/templates/404.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends "layout.html" %} 3 | 4 | {% block page_title %}Page Not Found{% endblock %} 5 | 6 | {% block content %} 7 |
<div>
8 |   <div>
9 |     <h1>404</h1>
10 |     <p>Sorry, that page doesn't exist.</p>
11 |     <p>Want to go home instead?</p>
12 |   </div>
13 | </div>
14 | {% endblock %} 15 | 16 | -------------------------------------------------------------------------------- /installer_scripts/run_docker.sh: -------------------------------------------------------------------------------- 1 | # Start the server 2 | docker run --privileged --name=webstats1 -d webstats-server 3 | # Retrieve the local ip address 4 | docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' webstats1 5 | # Move the most recent whl file to the docker container 6 | docker cp "$(ls -1ar ../dist/bioc_webstats-*.whl|head -1)" webstats1:/home/ubuntu 7 | # login 8 | docker exec -it --user ubuntu webstats1 /bin/bash 9 | -------------------------------------------------------------------------------- /bioc_webstats/templates/401.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends "layout.html" %} 3 | 4 | {% block page_title %}Unauthorized{% endblock %} 5 | 6 | {% block content %} 7 |
<div>
8 |   <div>
9 |     <h1>401</h1>
10 |     <p>You are not authorized to see this page.
11 |     </p>
12 |     <p>Want to go home instead?</p>
13 |   </div>
14 | </div>
15 | {% endblock %} 16 | 17 | -------------------------------------------------------------------------------- /assets/js/main.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Main Javascript file for bioc_webstats. 3 | * 4 | * This file bundles all of your javascript together using webpack. 5 | */ 6 | 7 | // JavaScript modules 8 | require('@fortawesome/fontawesome-free'); 9 | require('jquery'); 10 | require('bootstrap'); 11 | 12 | require.context( 13 | '../img', // context folder 14 | true, // include subdirectories 15 | /.*/, // RegExp 16 | ); 17 | 18 | // Your own code 19 | require('./plugins'); 20 | require('./script'); -------------------------------------------------------------------------------- /bioc_webstats/templates/about.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends "layout.html" %} 3 | 4 | {% block content %} 5 |
<div>
6 |   <h1>About</h1>
7 |   <div>
8 |     <p>This template was created by Steven Loria for use with the cookiecutter package by Audrey Roy.</p>
9 |   </div>
10 | </div>
11 | {% endblock %}
12 |
13 |
--------------------------------------------------------------------------------
/etl/postgresql/t_webstats_info_psql.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.webstats_info
2 |
3 | -- DROP TABLE IF EXISTS public.webstats_info;
4 |
5 | CREATE TABLE IF NOT EXISTS public.webstats_info
6 | (
7 |     key character varying(23) COLLATE pg_catalog."default" NOT NULL,
8 |     value character varying(128) COLLATE pg_catalog."default",
9 |     CONSTRAINT webstats_info_pkey PRIMARY KEY (key)
10 | )
11 |
12 | TABLESPACE pg_default;
13 |
14 | ALTER TABLE IF EXISTS public.webstats_info
15 |     OWNER to postgres;
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Environment variable overrides for local development
2 | FLASK_APP="bioc_webstats.app:create_app('development')"
3 | FLASK_DEBUG=1
4 | FLASK_ENV=development
5 | #DATABASE_URL=sqlite:///dev.db
6 | DATABASE_URL=postgresql://postgres@localhost:5432/webstats
7 | LOG_LEVEL=DEBUG
8 | SECRET_KEY=not-so-secret
9 | # In production, set to a higher number, like 31556926
10 | SEND_FILE_MAX_AGE_DEFAULT=0
11 |
12 | # AWS Roles
13 | AWS_ROLE_WEBRUNNER=bioc-webstats-webrunner
14 | AWS_ROLE_ETL=bioc-webstats-etl
--------------------------------------------------------------------------------
/bioc_webstats/templates/main.css:
--------------------------------------------------------------------------------
1 | <style>
46 | </style>
47 |
48 |
49 |
50 | <table>
51 |   <tr>
52 |
53 |     <td>
54 |       See download stats for: &nbsp;&nbsp;
55 |       {% for ref, desc in category_links %}
56 |         Bioconductor {{ desc }} packages &nbsp;&nbsp;
57 |       {% endfor %}
58 |     </td>
59 |   </tr>
60 | </table>
61 | <h1>Download stats for Bioconductor {{ category_name }} packages</h1>
62 | <p>{{ url_list }}</p>
63 | <p>
64 | Data as of {{ generated_date.strftime("%a. %d %b %Y") }}.
65 | </p>
66 | <p>The number reported next to each package name is the download score, that is, the average number of
67 | distinct IPs that "hit" the package each month for the last 12 months (not counting the current month).</p>
68 | {% if top_count > 0 %}
69 |
70 | <h2>Top {{ top_count }}</h2>
71 | <table>
72 |   <tbody>
73 |
74 |
75 |
76 | {% set N = top|length // 3 %}
77 | {% for i in range(N) %}
78 |   <tr>
79 |   {% for j in range(3) %}
80 |     {% set (package_name, package_score, package_rank)=top[i + j * N] %}
81 |     <td>
82 |       {{ package_rank }}
83 |     </td>
84 |     <td>
85 |       {{ package_name}} ({{ package_score }})
86 |     </td>
87 |   {% endfor %}
88 |   </tr>
89 | {% endfor %}
90 |   </tbody>
91 | </table>
92 |
93 |
94 |
95 | <h2>All {{ category_name }} packages</h2>
96 | {% endif %}
97 | <p>
98 | All {{ category_name }} package stats in one file:&nbsp;
99 |
100 | {{ tab_page_prefix }}_pkg_stats.tab
101 |
102 | </p>
103 | <p>
104 | All {{ category_name }} download scores in one file:&nbsp;
105 |
106 | {{ tab_page_prefix }}_pkg_scores.tab
107 |
108 | </p>
109 | <p>
110 |
111 | See Download stats for Bioconductor {{ category_name }} repository (all packages combined)
112 |
113 | </p>
114 |
115 | {% for letterkey, package_list in scores.items() %}
116 |
117 | <h2>{{ letterkey }}</h2>
118 | {% set n = package_list|length %}
119 | {% set third = (n / 3)|round(method='ceil')|int %}
120 |
121 | {% for i in range(3) %}
122 |   <div>
123 |   {% for package_name, package_score, package_rank in package_list[i*third:(i+1)*third] %}
124 |
125 |     {{ package_name }} ({{ package_score }})
126 |
127 |   {% endfor %}
128 |   </div>
129 | {% endfor %}
130 |
131 | {% endfor %}
132 |
133 |
134 |
--------------------------------------------------------------------------------
/bioc_webstats/packages_table_update.py:
--------------------------------------------------------------------------------
1 | """get_bioc_package_history - Extract Bioconductor package history from source repo."""
2 | import logging
3 | import yaml
4 | import requests
5 | import bioc_webstats.models as db
6 |
7 |
8 | from flask import current_app
9 |
10 | # Local manifest constants
11 | BIOCONDUCTOR_HOME_URI = "https://www.bioconductor.org/"
12 | PACKAGE_CATEGORIES = ["bioc", "data-annotation", "data-experiment", "workflows"]
13 | PACKAGE_UPDATE_MAXIMUM_ALLOWED = 50
14 |
15 | def version_str_to_int(version_number: str):
16 |     parts = version_number.split('.')
17 |     if len(parts) == 2:
18 |         return int(parts[0]) * 100 + int(parts[1])
19 |     else:
20 |         raise ValueError(f"Invalid version number: {version_number}")
21 |
22 | def version_int_to_str(version_int: int):
23 |     return f"{version_int // 100}.{version_int % 100}"
24 |
25 | def web_download(stem: str, fqdn: str = BIOCONDUCTOR_HOME_URI):
26 |     uri = f"{fqdn}{stem}"
27 |     response = requests.get(uri)
28 |     if response.status_code != 200:
29 |         raise ValueError(f"Failed to download file from {uri}")
30 |     return response.text
31 |
32 |
33 | def packages_table_update(dry_run: bool, verbose: bool, force: bool):
34 |     """Update database table packages from manifests on www.bioconductor.org.
35 |
36 |     Keyword Arguments:
37 |     dry_run -- Calculate changes to packages but do not update the database (default: {False})
38 |     verbose -- Additional information to log file (default: {True})
39 |     force -- Proceed with update even if the number of changes exceeds PACKAGE_UPDATE_MAXIMUM_ALLOWED. (default: {False})
40 |
41 |     Returns:
42 |     None
43 |     """
44 |
45 |     log = current_app.logger
46 |     log.log(logging.INFO, "starting packages update")
47 |
48 |     bioconductor_config = yaml.safe_load(web_download("config.yaml"))
49 |
50 |     release_version = bioconductor_config["release_version"]
51 |     devel_version = bioconductor_config["devel_version"]
52 |
53 |     manifest_packages = {}
54 |     for category in PACKAGE_CATEGORIES:
55 |         package_text = web_download(f"packages/devel/{category.replace("-", "/")}/src/contrib/PACKAGES")
56 |         package_list = package_text.splitlines()
57 |         p = {line.split(": ")[1]: db.PackageType[category.removeprefix("data-").upper()] for line in package_list if line.startswith("Package:")}
58 |         manifest_packages.update(p)
59 |
60 |     # dev_packages represents the currently active packages in the "devel" version.
61 |     # all_packages is the complete package history.
62 |     # A package that is in dev but not all is new in the devel version,
63 |     # and so will be added with first_version set to the devel version and last_version set to NULL.
64 |     # A package that is in all but not dev has been removed from the last version,
65 |     # and so will have its last_version value set to the release_version.
66 |     # If it is in both dev and all, the last_version should be null. If it is not, then
67 |     # the package was reinstated in the devel release and the last_version will be reset to NULL.
68 |
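    # Worked illustration of the set algebra below (package names hypothetical):
    #   dev_packages          = {"pkgA", "pkgB"}   (in the devel manifest)
    #   all_active_packages   = {"pkgB", "pkgC"}   (history rows with last_version NULL)
    #   all_inactive_packages = {"pkgD"}           (history rows with a last_version)
    # gives removed = {"pkgC"}, reinstated = {} (it would be {"pkgD"} if pkgD
    # reappeared in devel), and new = {"pkgA"}.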
69 |     dev_packages = set(manifest_packages.keys())
70 |     # The Packages model returns 4-tuples. Turn this into a dictionary, indexed by package name.
71 |     all_package_details = db.Packages.all_package_details()
72 |     all_active_packages = {t[0] for t in all_package_details if t[3] == 'NULL'}
73 |     all_inactive_packages = {t[0] for t in all_package_details} - all_active_packages
74 |
75 |     removed_package_names = all_active_packages - dev_packages
76 |     reinstated_package_names = all_inactive_packages & dev_packages
77 |     new_package_names = dev_packages - all_active_packages - reinstated_package_names
78 |     if (verbose):
79 |         log.log(logging.INFO, f"Total packages before update: {len(all_package_details)}")
80 |         log.log(logging.INFO, f"Packages removed: {len(removed_package_names)}")
81 |         log.log(logging.INFO, f"Packages added: {len(new_package_names)}")
82 |         log.log(logging.INFO, f"Packages reinstated: {len(reinstated_package_names)}")
83 |
84 |     total_changes = len(removed_package_names) + len(reinstated_package_names) + len(new_package_names)
85 |     if total_changes > PACKAGE_UPDATE_MAXIMUM_ALLOWED:
86 |         log.log(logging.WARN, f"total number of changes ({total_changes}) exceeds maximum allowed ({PACKAGE_UPDATE_MAXIMUM_ALLOWED})")
87 |         if not force:
88 |             log.log(logging.ERROR, "No update made")
89 |             return
90 |         log.log(logging.WARN, "Force parameter is TRUE. Update will proceed")
91 |
92 |     # mark the inactive packages with the value of the last release
93 |     db.Packages.update_package_last_version(removed_package_names, release_version)
94 |     # mark any reinstated packages by setting the last_version to NULL
95 |     db.Packages.update_package_last_version(reinstated_package_names, None)
96 |     # insert any new packages
97 |     records = [{"package": package, "category": manifest_packages[package], "first_version": str(version_str_to_int(devel_version)), "last_version": None} for package in new_package_names]
98 |     if (len(records) > 0):
99 |         db.Packages.insert_records(records)
100 |         # TODO report on console if error
101 |     return
102 |
--------------------------------------------------------------------------------
/bioc_webstats/templates/stats-bioc.html:
--------------------------------------------------------------------------------
1 | {% macro title_text(package, category) %} Download stats for {% if package !=
2 | None %} {{ category_name }} package {{ package }} {% else %} Bioconductor {{
3 | category_name }} repository (all packages combined) {% endif %} {% endmacro %}
4 |
5 | <html>
6 | <head>
7 |
8 |
9 | <title>{{ title_text(package, category) }}</title>
10 | <style>
42 | </style>
43 | </head>
44 | <body>
45 |
46 | <table>
47 |   <tr><td>
48 |     Back to the "Download stats for Bioconductor {{ category_name }}
49 |     packages" page
50 |   </td></tr>
51 | </table>
52 |
53 |
54 | <h1>{{ title_text(package, category) }}</h1>
55 | <p>
56 | Data as of {{ generated_date.strftime("%a. %d %b %Y") }}
57 | </p>
58 | {% if package is not none %}
59 | <p>
60 | {% if deprecated_version is not none %}
61 | Package {{ package }} is not in the current release of Bioconductor. It was last seen in {{ deprecated_version }}.
62 | {% else %}
63 | {{ package }} home page:
64 | release version,
65 | devel version.
66 | {% endif %}
67 | </p>
68 | {% endif %}
69 | <p>
70 | Number of package downloads from the Bioconductor software package
71 | repository, year by year, from {{ last_year }} back to {{ first_year }} (years with no downloads are omitted):
72 | </p>
73 |
74 | {% for data_year in data_by_year.keys() %}
75 |
76 | <h2>{{ data_year }}</h2>
77 | <table>
78 |   <tr>
79 |     <td>
80 |
81 |       <canvas>
82 |         Your browser may be too old as it does not support the html canvas element.
83 |       </canvas>
84 |
85 |     </td>
86 |     <td>
87 |       <table>
88 |         <tr>
89 |           <th>Month</th>
90 |           <th>Nb of distinct IPs</th>
91 |           <th>Nb of downloads</th>
92 |         </tr>
93 |
94 |
95 |         {% for line in data_by_year[data_year] %}
96 |         <tr>
97 |           <td>{{line["month"]}}/{{ line["year"] }}</td>
98 |
99 |           <td>{{ line["unique_ips"] }}</td>
100 |
101 |
102 |           <td>{{ line["downloads"] }}</td>
103 |
104 |         </tr>
105 |         {% endfor %}
106 |
107 |       </table>
108 |
109 |       {{ package or category }}_{{ data_year }}_stats.tab
110 |
111 |
112 |     </td>
113 |   </tr>
114 | </table>
115 | {% endfor %}
116 |
117 | <p>
118 | All years in one file: {{ package or category
119 | }}_stats.tab
120 | </p>
121 | <script>
124 | </script>
125 |
126 | </body>
127 | </html>
--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
1 | """Model unit tests."""
2 | import datetime as dt
3 |
4 | import pytest
5 | from sqlalchemy import select
6 |
7 | from bioc_webstats.models import (
8 |     Packages,
9 |     PackageType,
10 |     Stats,
11 |     WebstatsInfo,
12 |     list_to_dict,
13 | )
14 |
15 | from .conftest import check_hashed_count_list
16 |
17 |
18 | @pytest.mark.usefixtures("db")
19 | class TestStats:
20 |     """Stats tests."""
21 |
22 |     def test_db_valid_thru_date(self, webstatsinfo):
23 |         """Verify the expected last database update date."""
24 |         # Arrange
25 |         expected = dt.date(2023, 10, 4)
26 |
27 |         # Act
28 |         result = WebstatsInfo.get_valid_thru_date()
29 |
30 |         # Assert
31 |         assert result == expected
32 |
33 |     def test_statsfactory_types(self, db):
34 |         """Test stats factory."""
35 |         # Arrange
36 |
37 |         # Act
38 |         results = db.session.execute(select(Stats))
39 |         result = next(results, None)[0]
40 |
41 |         # Assert
42 |         assert isinstance(result.category, PackageType)
43 |         assert str(result.package)
44 |         assert isinstance(result.date, dt.date)
45 |         assert bool(result.is_monthly)
46 |         assert int(result.ip_count)
47 |         assert int(result.download_count)
48 |
49 |     def test_stats_getall(self, db, stats):
50 |         """Compare contents of stats table with the list of dictionaries from which it was created."""
51 |         # Arrange
52 |
53 |         # Act
54 |         result = db.session.scalars(select(Stats))
55 |         result = list_to_dict(result)
56 |
57 |         # Assert
58 |         assert check_hashed_count_list(result)
59 |         assert stats == result
60 |
61 |     def test_get_package_names(self, packages):
62 |         """Get the complete list of package names in collation sequence."""
63 |         # Arrange
64 |         expected = sorted(packages)
65 |
66 |         # Act
67 |         result = Packages.get_package_names()
68 |
69 |         # Assert
70 |         assert expected == result
71 |
72 |     # TODO Review database return values for consistency
73 |     # TODO Verify that we only want dates and counts for this function
74 |     def test_get_download_counts_year(self, stats):
75 |         """Select category, package and year."""
76 |         category = PackageType.BIOC
77 |         package = "affy"
78 |         year = 2023
79 |         expected = [(x["date"], x["ip_count"], x["download_count"]) for x in stats
80 |                     if x["category"] == category and x["package"] == package and x["date"].year == year]
81 |
82 |         result = Stats.get_download_counts(category=category, package=package, year=year)
83 |
84 |         # Assert
85 |         assert expected == result
86 |
87 |     def test_get_download_counts_full_year(self, stats):
88 |         """Select one full year of download counts."""
89 |         # Arrange
90 |         category = PackageType.ANNOTATION
91 |         package = "BSgenome.Hsapiens.UCSC.hg38"
92 |         year = 2022
93 |         expected = [(d["date"], d["ip_count"], d["download_count"]) for d in stats
94 |                     if d["category"] == category and d["package"] == package and d["date"].year == year]
95 |
96 |         result = Stats.get_download_counts(
97 |             category=category, package=package, year=year
98 |         )
99 |
100 |         # Assert
101 |         assert result == expected
102 |
103 |     def test_get_download_counts_package(self, stats):
104 |         """Select all the download counts for a given package."""
105 |         # Arrange
106 |         category = PackageType.ANNOTATION
107 |         package = "BSgenome.Hsapiens.UCSC.hg38"
108 |         expected = [(d["date"], d["ip_count"], d["download_count"]) for d in stats
109 |                     if d["category"] == category and d["package"] == package]
110 |
111 |         # Check the legacy sort order (increasing or decreasing year, but always increasing month and day)
112 |         result_hi_first = Stats.get_download_counts(category=category, package=package, newest_year_first=True)
113 |         result_lo_first = Stats.get_download_counts(category=category, package=package, newest_year_first=False)
114 |
115 |         # Assert
116 |         expected_hi = sorted(expected, key=lambda x: (-x[0].year, x[0].month, x[0].day))
117 |         assert result_hi_first == expected_hi
118 |
119 |         expected_lo = sorted(expected, key=lambda x: (x[0].year, x[0].month, x[0].day))
120 |         assert result_lo_first == expected_lo
121 |
122 |     def test_get_download_counts_category(self, stats):
123 |         """Select all the download counts for a given category."""
124 |         # Arrange
125 |         category = PackageType.BIOC
126 |         expected = [(d["package"], d["date"], d["ip_count"], d["download_count"])
127 |                     for d in stats if d["category"] == category]
128 |
129 |         result = Stats.get_download_counts(category=category)
130 |
131 |         # Assert
132 |         assert result == expected
133 |
134 |     def test_get_download_scores(self, stats):
135 |         """Select all the scores for a given category."""
136 |         # Arrange
137 |         category = PackageType.BIOC
138 |         expected = [('affy', 2, 2), ('affydata', 7, 1)]
139 |
140 |         result = Stats.get_download_scores(category=category)
141 |
142 |         # Assert
143 |         assert result == expected
--------------------------------------------------------------------------------
/bioc_webstats/aws_functions.py:
--------------------------------------------------------------------------------
1 | """ aws_functions TODO rename this. """
2 | import json
3 | import logging
4 | import boto3
5 | import psycopg2
6 | from botocore.exceptions import ClientError
7 |
8 |
9 | def aws_assume_sts_role(role_arn, role_session_name):
10 |     """Assume the given STS role and return its temporary credentials."""
11 |     try:
12 |         # Create an STS client
13 |         sts_client = boto3.client('sts')
14 |
15 |         # Assume the specified role
16 |         assumed_role_object = sts_client.assume_role(
17 |             RoleArn=role_arn,
18 |             RoleSessionName=role_session_name
19 |         )
20 |
21 |         # Credentials to be used for the session with the assumed role
22 |         credentials = assumed_role_object['Credentials']
23 |         return credentials
24 |
25 |     except Exception as e:
26 |         logging.critical(f"Could not assume AWS role {role_arn}. {e}")
27 |         raise SystemExit(1)
28 |
29 |
30 | def get_parameter_store_values(parameter_path: str, region_name='us-east-1') -> dict:
31 |     """Get all SSM parameter store values for a specific configuration.
32 |
33 |     Arguments:
34 |     parameter_path -- The prefix for the configuration. Example: "/bioc/webstats/dev"
35 |
36 |     Returns:
37 |     A dictionary of parameter names (excluding the prefix) and their values.
38 |     """
39 |
40 |     try:
41 |         ssm_client = boto3.client('ssm', region_name=region_name)
42 |         # get_parameters_by_path returns results a page at a time; iterate with
43 |         # the paginator so configurations with more than one page of parameters
44 |         # are read completely.
45 |         paginator = ssm_client.get_paginator('get_parameters_by_path')
46 |         result = {}
47 |         for page in paginator.paginate(Path=parameter_path, Recursive=True):
48 |             for item in page["Parameters"]:
49 |                 result[item["Name"][len(parameter_path) + 1:]] = item["Value"]
50 |         return result
51 |
52 |     except Exception as e:
53 |         logging.critical(f"Failed to read AWS parameter store {parameter_path}. {e}")
54 |         raise SystemExit(1)
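# Hedged example (parameter names and values are illustrative): if SSM holds
#   /bioc/webstats/dev/db/dbname  = "webstats"
#   /bioc/webstats/dev/log_level  = "DEBUG"
# then get_parameter_store_values("/bioc/webstats/dev") returns
#   {"db/dbname": "webstats", "log_level": "DEBUG"}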
55 |
56 |
57 | def get_secret(secret_name, region_name):
58 |     """Return the SecretString for a secret held in AWS Secrets Manager."""
59 |     # Create a Secrets Manager client
60 |     session = boto3.session.Session()
61 |     client = session.client(
62 |         service_name='secretsmanager',
63 |         region_name=region_name
64 |     )
65 |
66 |     try:
67 |         get_secret_value_response = client.get_secret_value(
68 |             SecretId=secret_name
69 |         )
70 |
71 |         # Decrypts secret using the associated KMS key.
72 |         secret = get_secret_value_response['SecretString']
73 |         return secret
74 |
75 |     except Exception as e:
76 |         logging.critical(f"Could not read AWS secret {secret_name}. {e}")
77 |         raise SystemExit(1)
78 |
79 | def psql_get_connection(secret_name, region_name, database_name):
80 |     """Open a psycopg2 connection using credentials held in AWS Secrets Manager."""
81 |
82 |     try:
83 |         connection_string = aws_secret_to_psql_url(secret_name, region_name, database_name)
84 |         conn = psycopg2.connect(connection_string)
85 |         return conn
86 |
87 |     except Exception as e:
88 |         logging.critical(f"Could not open psql {database_name} with {secret_name}. {e}")
89 |         raise SystemExit(1)
90 |
91 |
92 | def aws_secret_to_psql_url(secret_name, region_name, database_name):
93 |     """Build a PostgreSQL connection URL from a Secrets Manager credentials secret."""
94 |     secret = get_secret(secret_name, region_name)
95 |     db_credentials = json.loads(secret)
96 |     # TODO: add database name to secret in Secrets Manager
97 |     db_credentials['dbname'] = database_name
98 |
99 |     # Create the PostgreSQL connection string
100 |     connection_string = f"postgresql://{db_credentials['username']}:{db_credentials['password']}@{db_credentials['host']}:{db_credentials['port']}/{db_credentials['dbname']}"
101 |     return connection_string
102 |
103 |
104 | def uri_to_arn(uri):
105 |     # Parse the URI to extract the components
106 |     scheme, path = uri.split("://", 1)
107 |     if scheme != "awsarn":
108 |         raise ValueError("Invalid scheme in URI: expected 'awsarn'")
109 |
110 |     # Split the path into its components
111 |     service_region, account_id, resource_type_path = path.split("/", 2)
112 |     service, region = service_region.split(".", 1)
113 |
114 |     # Construct the ARN
115 |     arn = f"arn:aws:{service}:{region}:{account_id}:{resource_type_path}"
116 |
117 |     return arn
118 |
119 | # Example usage
120 | # uri = "awsarn://secretsmanager.us-east-1.amazonaws.com/931729544676/secret/bioc/rdb/login/webstats_runner-fQFuUn"
121 | # arn = uri_to_arn(uri)
122 | # print(arn)
123 |
124 |
125 | def arn_to_uri(arn):
126 |     # Validate and parse the ARN
127 |     parts = arn.split(':')
128 |     if len(parts) != 6 or parts[0] != 'arn' or parts[1] != 'aws':
129 |         raise ValueError("Invalid ARN format")
130 |
131 |     # Extract the ARN components
132 |     _, _, service, region, account_id, resource_path = parts
133 |
134 |     # Construct the URI
135 |     uri = f"awsarn://{service}.{region}.amazonaws.com/{account_id}/{resource_path}"
136 |
137 |     return uri
138 |
139 | # Example usage
140 | # arn = "arn:aws:secretsmanager:us-east-1:931729544676:secret:/bioc/rdb/login/webstats_runner-fQFuUn"
141 | # uri = arn_to_uri(arn)
142 | # print(uri)
143 |
144 | def cloudfront_invalidation(distribution_id, paths):
145 |     """Invalidate the CloudFront cache.
146 |
147 |     Keyword Arguments:
148 |     distribution_id -- The CloudFront distribution whose cache is invalidated (default: {'E1TVLJONPTUXV3'})
149 |     paths -- The URL paths to invalidate (default: {['/packages/stats/*']})
150 |     """
151 |
152 |
153 |     # TODO move distribution_id and paths defaults to flask dispatcher
154 |     client = boto3.client('cloudfront')
155 |
156 |     # Distribution ID and the paths you want to invalidate
157 |
158 |     # Create the invalidation
159 |     response = client.create_invalidation(
160 |         DistributionId=distribution_id,
161 |         InvalidationBatch={
162 |             'Paths': {
163 |                 'Quantity': len(paths),
164 |                 'Items': paths
165 |             },
166 |             'CallerReference': str(hash(frozenset(paths)))  # Unique reference
167 |         }
168 |     )
169 |
170 |     # TODO from response, log error if needed, otherwise report timestamp for invalidation
--------------------------------------------------------------------------------
/docs/webstats-system-overview.md:
--------------------------------------------------------------------------------
1 | # `webstats` System Overview
2 |
3 | Author: Robert Shear rshear@ds.dfci.harvard.edu
4 | Date: 2024-09-25
5 |
6 | This document provides a high level overview of the operation of the `bioc-webstats` system. It is current as of version `0.1.9`.
7 |
8 | The purpose of bioc-webstats is to maintain a permanent record of the download counts for each Bioconductor package in an SQL database and to report this information on www.bioconductor.org. It includes records from January 1, 2009, to the present.
9 |
10 | This application replaces the "stats server" application. That server produced static pages for all the content under `www.bioconductor.org/packages/stats/`. The `bioc-webstats` application, in its initial implementation, is designed to match the exact form of the application that it replaces.
11 |
12 | The `bioc-webstats` application is implemented as a Python application supported by an SQL database. It has two major functions.
13 |
14 | 1. Data ingestion. Consume web traffic logs in Common Log Format (CLF), select those records which are package downloads, store them in an SQL table, and maintain summary statistics for each package.
15 | 2. Web reporting. The application can serve as a backend to any web server that supports the `WSGI` standard. It consumes `http get` requests, infers their semantics from the `URI` stem, and returns content that is functionally the same as the system it replaces. In the case of `html` responses, this means that both the content and the look and feel are the same. Other responses are unformatted text downloads (`.tab` and `.txt`), which are byte-for-byte identical to the prior system.
16 |
17 | ## Technical Stack
18 |
19 | The application implementation is based on several frameworks and libraries:
20 |
21 | - Python 3.12
22 | - [Poetry](https://python-poetry.org) - Dependency management and packaging.
23 | - [Flask](https://flask.palletsprojects.com) - Web application framework.
24 | - [SQLAlchemy](https://www.sqlalchemy.org) - Python SQL toolkit and object-relational mapper.
25 | - [Chart.js](https://www.chartjs.org/) - JavaScript charting library.
26 | - [Bootstrap 5](https://getbootstrap.com) - Responsive JavaScript frontend toolkit.
27 |
28 |
29 | There are various additional Python and JavaScript dependencies that support the application. See `pyproject.toml` in the project root directory for details.
30 |
31 | The application is distributed as a `whl` file and can be installed by any installation and package manager, including `poetry`, `pipenv`, `pipx`, `virtualenv`, or `conda`.
32 |
33 | ## General System Flow
34 |
35 | The general system flow as deployed as of this writing is depicted in Figure 1 below. The Python application can be run on any server that has secure access to `master.bioconductor.org` and the AWS `Athena` service that can read the CloudFront CLF logs. This includes `master.bioconductor.org` itself. Other components, including the SQL server and the internal web server, are easily replaced.
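For example, the production entry point in this repository serves the Flask app through Waitress (see `bioc_webstats/app_waitress.py`); a minimal sketch of that arrangement:

```python
from waitress import serve
from bioc_webstats import app

# create_app('production', '/bioc/webstats/prod') builds the Flask app and
# pulls its runtime parameters from that AWS SSM Parameter Store path.
serve(app.create_app('production', '/bioc/webstats/prod'),
      host='0.0.0.0', port=8000)
```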
36 |
37 | Not depicted in the stack are AWS-specific features for configuration (the AWS Systems Manager Parameter Store) and security (the AWS Secrets Manager).
38 |
39 | Note: The direction of each line indicates the functional flow of information. That is, for every request-response pattern, the arrow points to the consumer of the response.
40 |
41 | ![General System Flow](bioc-webstats-architecture-v2.excalidraw.png)
42 | <p align="center">
43 | Figure 1: General System Flow.
44 | </p>
45 |
46 | A. A [Waitress](https://pypi.org/project/waitress/) lightweight webserver that consumes incoming `http get` requests from, and returns results to, an upstream webserver (arrow 3).
47 |
48 | B. A cron job that runs daily at 01:00 UTC that:
49 | - Detects changes in the development version of the manifest (arrow 1).
50 | - Invokes an AWS Athena view to return all CloudFront log entries for dates newer than those previously uploaded (arrow 2), but only if they have a URI stem that implies a download, and only if the package name is valid.
51 | - Updates the summary tables, `stats` and `categorystats` (see Database Structure below).
52 |
53 | C. An SQL database server. Currently implemented as a serverless AWS RDS instance, Postgres 15.
54 |
55 |
56 | # Database Structure
57 |
58 | ![Database Model](webstats-erd-0_1_9.png)
59 |
60 |
61 | # Configuration
62 |
63 | ## Development Configuration
64 |
65 | ## Production Configuration
66 |
67 | ## Parameter Names and Default Values
68 |
69 |
70 | | Name | FlaskName | Default Value | Description |
71 | | ----------------- | ------------- | ------------------------------ | ------------------------ |
72 | | db/dbname | DBNAME | webstats | Postgres database name, default 'webstats' |
73 | | db/credentials | DBCREDENTIALS | arn:aws:secretsmanager:reference-to-database-credentials-secret | arn of secrets manager secret |
74 | | db/dbuser | DBUSER | webstats_runner | PostgreSQL user name, default 'webstats_runner' |
75 | | db/port | DBPORT | 5432 | Server endpoint port number |
76 | | db/server | DBSERVER | None | The symbolic address of the endpoint for the Postgres server |
77 | | flask/flask_app | APP | bioc_webstats.app:create_app('development') | Default initiation call for Flask |
78 | | flask/approot | APPROOT | /var/www/webstats | Working directory for app |
79 | | flask/logroot | LOGROOT | /var/log/bioc-webstats | Location of log files for the app |
80 | | flask/osgroup | OSGROUP | webstats | Execution group name |
81 | | flask/osuser | OSUSER | webstats | Execution user name |
82 | | flask/flask_debug | DEBUG | FALSE | Caution: Do not enable in production |
83 | | flask/log_level | LOG_LEVEL | INFO | Standard log levels, default 'INFO' |
84 | | flask/secret_key | SECRET_KEY | None | Secret key for activating web client flask debugging tools |
85 |
86 | # Deployment
87 |
88 | TODO
89 |
90 | # Web Application Internals Overview
91 |
92 | The platform-independent logic for the system is in the directory `./bioc_webstats`.
93 |
94 | ## Initialization
95 |
96 | The application is always initialized by invoking `bioc_webstats.app`.
97 |
98 | TODO
99 |
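A minimal sketch of that invocation, mirroring `autoapp.py` and the production `flask_environment` file in this repository:

```python
from bioc_webstats.app import create_app

# Development: defaults from configmodule plus any .env overrides.
app = create_app("development")

# Production: the second argument names the AWS SSM Parameter Store path
# from which runtime parameters are read.
# app = create_app('production', '/bioc/webstats/prod')
```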
100 | ## Processing a `http get`
101 |
102 | 1. `stats.py`
103 |
104 | 2. `models.py`
105 |
106 | 3. `templates/stats-bioc.html`
107 |
108 |
--------------------------------------------------------------------------------
/bioc_webstats/commands.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Click commands."""
3 | import os
4 | from datetime import date, datetime
5 | from glob import glob
6 | import logging
7 | from subprocess import call
8 |
9 | import boto3
10 | import click
11 |
12 | from flask import current_app
13 | from bioc_webstats.ingest_logs import ingest_logs
14 | from bioc_webstats.packages_table_update import packages_table_update
15 | from bioc_webstats.configmodule import configuration_dictionary
16 |
17 | HERE = os.path.abspath(os.path.dirname(__file__))
18 | PROJECT_ROOT = os.path.join(HERE, os.pardir)
19 | TEST_PATH = os.path.join(PROJECT_ROOT, "tests")
20 |
21 |
22 | def parse_date(ctx, param, value):
23 |     """Helper for parsing click.option dates."""
24 |     if value is None:
25 |         return value
26 |     try:
27 |         return datetime.strptime(value, "%Y-%m-%d").date()
28 |     except ValueError:
29 |         raise click.BadParameter("Date should be in YYYY-MM-DD format.")
30 |
31 |
32 | @click.command()
33 | @click.option(
34 |     "-c/-C",
35 |     "--coverage/--no-coverage",
36 |     default=True,
37 |     is_flag=True,
38 |     help="Show coverage report",
39 | )
40 | def test(coverage):
41 |     """Run the tests."""
42 |     import pytest
43 |
44 |     args = [TEST_PATH, "--verbose"]
45 |     if coverage:
46 |         args.append("--cov=bioc_webstats")
47 |     rv = pytest.main(args)
48 |     exit(rv)
49 |
50 |
51 | @click.command()
52 | @click.option(
53 |     "-f",
54 |     "--fix-imports",
55 |     default=True,
56 |     is_flag=True,
57 |     help="Fix imports using isort, before linting",
58 | )
59 | @click.option(
60 |     "-c",
61 |     "--check",
62 |     default=False,
63 |     is_flag=True,
64 |     help="Don't make any changes to files, just confirm they are formatted correctly",
65 | )
66 | def lint(fix_imports, check):
67 |     """Lint and check code style with black, flake8 and isort."""
68 |     skip = ["node_modules", "requirements", "migrations"]
69 |     root_files = glob("*.py")
70 |     root_directories = [
71 |         name for name in next(os.walk("."))[1] if not name.startswith(".")
72 |     ]
73 |     files_and_directories = [
74 |         arg for arg in root_files + root_directories if arg not in skip
75 |     ]
76 |
77 |     def execute_tool(description, *args):
78 |         """Execute a checking tool with its arguments."""
79 |         command_line = list(args) + files_and_directories
80 |         click.echo(f"{description}: {' '.join(command_line)}")
81 |         rv = call(command_line)
82 |         if rv != 0:
83 |             exit(rv)
84 |
85 |     isort_args = []
86 |     black_args = []
87 |     if check:
88 |         isort_args.append("--check")
89 |         black_args.append("--check")
90 |     if fix_imports:
91 |         execute_tool("Fixing import order", "isort", *isort_args)
92 |     execute_tool("Formatting style", "black", *black_args)
93 |     execute_tool("Checking code style", "flake8")
94 |
95 |
96 | @click.command()
97 | def gendb():
98 |     """Generate small test database."""
99 |     from bioc_webstats.database import db
100 |     from tests.conftest import generate_small_test_db
101 |
102 |     click.echo("Creating small test database")
103 |     app = current_app._get_current_object()
104 |
105 |     test_db_contents = generate_small_test_db(app)
106 |
107 |
108 | @click.command()
109 | @click.option(
110 |     "-s",
111 |     "--start",
112 |     required=False,
113 |     callback=parse_date,
114 |     help="Beginning date for upload. Default: first date not already processed.",
115 | )
116 | @click.option(
117 |     "-e",
118 |     "--end",
119 |     required=False,
120 |     callback=parse_date,
121 |     help="Ending date for upload. Default: yesterday (UTC)",
122 | )
123 | @click.option(
124 |     "-d",
125 |     "--database",
126 |     required=False,
127 |     help="Name of the source database. Default: default",
128 | )
129 | @click.option(
130 |     "-f",
131 |     "--filename",
132 |     required=False,
133 |     help="Specifies the name of a local file to receive the csv results instead of sending them to the database",
134 | )
135 | @click.option(
136 |     "-c",
137 |     "--cloudfront",
138 |     required=False,
139 |     help="If present, the distribution ID of the CloudFront cache to refresh. If absent, no refresh",
140 | )
141 | @click.option(
142 |     "--path",
143 |     required=False,
144 |     help="The CloudFront path to refresh. Default: '/packages/stats/*'",
145 | )
146 | def ingest(start, end, database, filename, cloudfront, path):
147 |     """Read raw weblogs, select valid package downloads, update webstats database."""
148 |
149 |     if path is None:
150 |         path = "/packages/stats/*"
151 |
152 |     ingest_logs(
153 |         start_date=start,
154 |         end_date=end,
155 |         source_database=database,
156 |         result_filename=filename,
157 |         cloudfront_id=cloudfront,
158 |         cloudfront_path=path,
159 |     )
160 |
161 |
162 | @click.command()
163 | @click.option(
164 |     "-n", "--namespace", required=False, help="Namespace (parameter path prefix)"
165 | )
166 | @click.option("-p", "--profile", required=False, help="AWS SSO profile for target")
167 | @click.option("-r", "--region", required=False, help="AWS target region")
168 | def configp(namespace, profile, region):
169 |     """Initialize AWS parameter set."""
170 |
171 |     if namespace is None:
172 |         namespace = '/bioc/webstats/prod/'
173 |     if region is None:
174 |         region = 'us-east-1'
175 |     if profile is None:
176 |         profile = 'bioc'
177 |
178 |     session = boto3.Session(
179 |         profile_name=profile,
180 |         region_name=region
181 |     )
182 |
183 |     ssm_client = session.client('ssm')
184 |
185 |     # TODO Test for previously existing. Create --force parameter
186 |     # TODO Add tags
187 |
188 |     try:
189 |         for p in configuration_dictionary:
190 |             q = dict(p)  # copy, so the shared configuration_dictionary entry is not mutated
191 |             q["Name"] = namespace + p["Name"]
192 |             response = ssm_client.put_parameter(**q)
193 |             # TODO check response for errors
194 |     except Exception as e:
195 |         logging.error(f"Failed to store parameters. {e}")
196 |         raise e
197 |     logging.info(f"AWS parameter set configured namespace:{namespace} profile:{profile} region:{region}.")
198 |
199 | @click.command()
200 | @click.option("-d", "--dry_run", is_flag=True, required=False, help="Report packages changes but do not update database")
201 | @click.option("-v", "--verbose", is_flag=True, required=False, help="More reporting")
202 | @click.option("-f", "--force", is_flag=True, required=False, help="Update table even though the update is suspiciously large")
203 | def packages(dry_run: bool = False, verbose: bool = True, force: bool = False):
204 |     """Read package information from the Bioconductor infrastructure and update the packages table to reflect current status."""
205 |
206 |     packages_table_update(dry_run, verbose, force)
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Initialization for pytests."""
2 | import datetime as dt
3 | import logging
4 | import math
5 | from zlib import crc32
6 |
7 | import pytest
8 | from dateutil.relativedelta import relativedelta
9 | from flask import Flask
10 | from flask_sqlalchemy import SQLAlchemy
11 | from sqlalchemy import create_engine
12 | from sqlalchemy.pool import StaticPool
13 | from webtest import TestApp
14 |
15 | from bioc_webstats.app import create_app
16 | from bioc_webstats.extensions import db as _db
17 | from bioc_webstats.models import PackageType
18 |
19 | from .factories import PackagesFactory, StatsFactory, WebstatsInfoFactory
20 |
21 |
22 | @pytest.fixture(scope="session")
23 | def app():
24 |     """Create application for the tests."""
25 |     _app = create_app("debug")
26 |     create_engine(
27 |         "sqlite:///:memory:",
28 |         connect_args={"check_same_thread": False},
29 |         poolclass=StaticPool,
30 |     )
31 |     _app.logger.setLevel(logging.DEBUG)
32 |     ctx = _app.test_request_context()
33 |     ctx.push()
34 |
35 |     yield _app
36 |
37 |     ctx.pop()
38 |
39 |
40 | @pytest.fixture(scope="session")
41 | def db(app: Flask):
42 |     """Session-wide test database."""
43 |     _db = generate_small_test_db(app)
44 |     yield _db
45 |     _db.session.close()
46 |     _db.drop_all()
47 |
48 |
49 | @pytest.fixture(scope="session")
50 | def webapp(app: Flask, db: SQLAlchemy):
51 |     """Fixture for app test."""
52 |     return TestApp(app)
53 |
54 |
55 | @pytest.fixture(scope="function")
56 | def session(db: SQLAlchemy, request: pytest.FixtureRequest):
57 |     """Create isolated transaction."""
58 |     db.session.begin_nested()
59 |
60 |     def commit():
61 |         db.session.flush()
62 |
63 |     # patch commit method
64 |     old_commit = db.session.commit
65 |     db.session.commit = commit
66 |
67 |     def teardown():
68 |         db.session.rollback()
69 |         db.session.close()
70 |         db.session.commit = old_commit
71 |
72 |     request.addfinalizer(teardown)
73 |     return db.session
74 |
75 |
76 | database_test_cases = [
77 |     (PackageType.BIOC, "affy", "2023-09-01"),
78 |     (PackageType.BIOC, "affydata", "2023-08-01"),
79 |     (PackageType.ANNOTATION, "BSgenome.Hsapiens.UCSC.hg38", "2019-01-01"),
80 |     (PackageType.ANNOTATION, "BSgenome.Scerevisiae.UCSC.sacCer3", "2021-01-01"),
81 | ]
82 |
83 | database_test_valid_date = dt.date(2023, 10, 4)
84 |
85 |
86 | def create_hashed_counts(d: dict) -> tuple[int, int]:
87 |     """Calculate reproducible hashed ip_count and download_count values for test stats rows.
88 |
89 |     For small database tests, create ip_count and download_count values that are a function of the
90 |     other columns of the stats table. This function is used to both generate the test rows and to check
91 |     that the test rows return the correct values.
92 |
93 |     Arguments:
94 |     d -- A dictionary containing the values of a stats record
95 |
96 |     Returns:
97 |     An ordered pair, the hashed ip_count and the hashed download_count
98 |     """
99 |     s = "|".join(
100 |         [str(d.get(tag, "")) for tag in ["category", "package", "date", "is_monthly"]]
101 |     )
102 |     # 9007 is a prime number of a size to give a reasonable hash for test purposes
103 |     download_count = crc32(s.encode("utf-8")) % 9007
104 |     ip_count = int(math.ceil(math.sqrt(download_count)))
105 |     return (ip_count, download_count)
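# Worked illustration (shape only; the crc32 value is not recomputed here):
# for a row {"category": PackageType.BIOC, "package": "affy",
# "date": dt.date(2023, 9, 1), "is_monthly": True} the joined key is
# "PackageType.BIOC|affy|2023-09-01|True"; download_count = crc32(key) % 9007
# and ip_count = ceil(sqrt(download_count)), so reruns regenerate identical
# counts without storing fixture data.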
This function is used to both generate the test rows and to check 91 | that the test rows return the correct values. 92 | 93 | Arguments: 94 | d -- A dictionary containing the values of a stats record 95 | 96 | Returns: 97 | An ordered pair: the hashed ip_count and the hashed download_count 98 | """ 99 | s = "|".join( 100 | [str(d.get(tag, "")) for tag in ["category", "package", "date", "is_monthly"]] 101 | ) 102 | # 9007 is a prime number of a size to give a reasonable hash for test purposes 103 | download_count = crc32(s.encode("utf-8")) % 9007 104 | ip_count = int(math.ceil(math.sqrt(download_count))) 105 | return (ip_count, download_count) 106 | 107 | 108 | def generate_small_test_db_packages(): 109 | """Create list of package names in the small_test database.""" 110 | packages_dict = [] 111 | for category, package, _ in database_test_cases: 112 | u = { 113 | "category": category, 114 | "package": package, 115 | "first_version": 201, 116 | "last_version": None, 117 | } 118 | packages_dict.append(u) 119 | return packages_dict 120 | 121 | 122 | def generate_small_test_db_stats(): 123 | """Create list of dictionary objects corresponding to Stats columns for the small test database.""" 124 | end_date = database_test_valid_date 125 | 126 | def months_sequence(start_date: dt.date, end_date: dt.date): 127 | """Yield the first day of each month from start_date to end_date inclusive.""" 128 | current_date = start_date 129 | 130 | while current_date <= end_date: 131 | yield current_date 132 | current_date += relativedelta(months=1) 133 | 134 | stats_dict = [] 135 | for category, package, start_date in database_test_cases: 136 | for d in months_sequence( 137 | dt.datetime.strptime(start_date, "%Y-%m-%d").date(), end_date 138 | ): 139 | u = { 140 | "category": category, 141 | "package": package, 142 | "date": d, 143 | "is_monthly": True, 144 | } 145 | u["ip_count"], u["download_count"] = create_hashed_counts(u) 146 | stats_dict.append(u) 147 | 148 | return stats_dict 149 | 150 | def generate_small_test_db(app: Flask): 151 | """Session-wide test database.""" 152 | _db.app = app 153 | with app.app_context(): 154 | _db.create_all() 155 | u = generate_small_test_db_stats() 156 | [StatsFactory(**v) for v in u] 157 | u = [{"key": "ValidThru", "value": "2023-10-04"}] 158 | [WebstatsInfoFactory(**v) for v in u] 159 | u = generate_small_test_db_packages() 160 | [PackagesFactory(**v) for v in u] 161 | _db.session.commit() 162 | return _db 163 | 164 | 165 | def check_hashed_counts(d: dict) -> bool: 166 | """Check that a generated test data row with hashed counts is correct. 167 | 168 | Arguments: 169 | d -- Dictionary form of stats table row 170 | 171 | Returns: 172 | True ==> The ip_count and download_count match the calculated hash 173 | """ 174 | ip_count, download_count = create_hashed_counts(d) 175 | return ( 176 | d.get("ip_count", -1) == ip_count 177 | and d.get("download_count", -1) == download_count 178 | ) 179 | 180 | 181 | def check_hashed_count_list(d_list: list[dict]) -> bool: 182 | """Check that all stats rows in a list have the expected hash counts. 183 | 184 | Arguments: 185 | d_list -- A list of dictionaries derived from Stats rows. 186 | 187 | Returns: 188 | True ==> All the rows have the expected count values. 189 | False ==> At least one row has an incorrect count value. 
190 | """ 191 | for r in d_list: 192 | if not check_hashed_counts(r): 193 | return False 194 | return True 195 | 196 | 197 | @pytest.fixture(scope="session") 198 | def webstatsinfo(db: SQLAlchemy): 199 | """Create WebstatsInfo for the tests.""" 200 | return 201 | 202 | 203 | @pytest.fixture(scope="session") 204 | def stats(db: SQLAlchemy): 205 | """Create stats for the tests.""" 206 | return generate_small_test_db_stats() 207 | 208 | 209 | @pytest.fixture(scope="session") 210 | def packages(db: SQLAlchemy): 211 | """Create packages for the tests.""" 212 | return [u['package'] for u in generate_small_test_db_packages()] 213 | -------------------------------------------------------------------------------- /bioc_webstats/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | app.py 3 | 4 | Create the Flask application and initialize the environment. 5 | 6 | Summary: 7 | This module implements the application factory, 8 | as explained here: http://flask.pocoo.org/docs/patterns/appfactories/. 9 | 10 | Description: 11 | All the run time parameters are read and various services are registered with the Flask infrastructure. 12 | 13 | Notes: 14 | The run time parameters can come from the following sources. They are processed in the order 15 | shown here. A later source will overwrite an earlier source. 16 | 1. Bootstrap parameters. Environment variables necessary to get started. 17 | 2. configmodule.py. This defines the invariant default values for each parameter. Also defines manifest constants. 18 | 3. An environmental parameter store. Specifically, the AWS Systems Manager (SSM) Parameter Store. 19 | 4. FLASK_* environment variables. For temporary overrides in production. 20 | 5. ".env" files. Provides for parameters to be set based on their presence in this file. 21 | Useful for setting up test environments. Should never be used in production. 22 | 23 | """ 24 | import logging 25 | import logging.handlers 26 | import os 27 | import sys 28 | 29 | from flask import Flask, render_template 30 | from werkzeug.utils import import_string 31 | 32 | import bioc_webstats.aws_functions as aws 33 | from bioc_webstats import commands, splash, stats 34 | from bioc_webstats.configmodule import configuration_dictionary 35 | from bioc_webstats.extensions import ( 36 | cache, 37 | csrf_protect, 38 | db, 39 | debug_toolbar, 40 | flask_static_digest, 41 | migrate, 42 | ) 43 | 44 | 45 | def create_app( 46 | config_type=None, 47 | aws_parameter_path=None, 48 | enable_remote_debugging=False 49 | ): 50 | """The Application Factory. Set up the particular instance of the Flask class. 51 | 52 | Keyword Arguments: 53 | config_type -- The configmodule subclass object to use. Allowed values "production" and "development" (default: {"production"}) 54 | aws_parameter_path -- If present, the AWS Systems Manager Parameter Store will be searched for runtime parameters. See function aws.get_parameter_store_values for more information (default: {None}) 55 | enable_remote_debugging -- If True, the system will attempt to enable the Visual Studio Code remote debugging protocol. See package ptvsd for details. (default: {False}) 56 | 57 | Returns: 58 | A fully configured Flask App object. 
59 | """ 60 | 61 | 62 | if enable_remote_debugging: 63 | # This will allow the use of the VS Code remote debugger. 64 | import ptvsd 65 | ptvsd.enable_attach(address=('0.0.0.0', 5678), redirect_output=True) 66 | print("Waiting for debugger to attach...") 67 | ptvsd.wait_for_attach() 68 | 69 | app = Flask(__name__.split(".")[0]) 70 | 71 | # Bootstrap variables 72 | if config_type is None: 73 | config_type = os.getenv('FLASK_ENV', config_type) 74 | app.config["ENV"] = config_type 75 | 76 | config_object_name = f"bioc_webstats.configmodule.{app.config['ENV']}Config" 77 | 78 | # Populate the configuration from config and its subclasses 79 | cfg = import_string(config_object_name)() 80 | app.config.from_object(cfg) 81 | 82 | # Next, load parameters from the SSM Parameter store. 83 | if aws_parameter_path is not None: 84 | param_dict = aws.get_parameter_store_values(app.config["AWS_PATH_PARAMETER"]) 85 | xref = {} 86 | for u in configuration_dictionary: 87 | xref[u["Name"]] = u["FlaskName"] 88 | for k, v in param_dict.items(): 89 | try: 90 | app.config[xref[k]] = v 91 | except Exception as e: 92 | logging.error(f"Could not map AWS SSM parameter {k}: {e}") 93 | 94 | # Override with environment variables with FLASK_ prefix 95 | app.config.from_prefixed_env() 96 | 97 | # Extract database credentials from Secrets Manager 98 | if "DBCREDENTIALS" in app.config: 99 | # The credentials secret may arrive via SSM or the environment, so read it from app.config 100 | app.config["DATABASE_URL"] = aws.aws_secret_to_psql_url( 101 | app.config["DBCREDENTIALS"], "us-east-1", "webstats" 102 | ) 103 | 104 | 105 | app.config["SQLALCHEMY_DATABASE_URI"] = app.config["DATABASE_URL"] 106 | # TODO SECRET_KEY from parameter store 107 | app.config[ 108 | "SECRET_KEY" 109 | ] = "1849cb85026145adc5164b9568d6afbde65351264f87c25aebdadc576ae662f5" 110 | 111 | configure_logger(app) 112 | register_extensions(app) 113 | register_blueprints(app) 114 | register_errorhandlers(app) 115 | register_shellcontext(app) 116 | register_commands(app) 117 | return app 118 | 119 | 120 | def register_extensions(app): 121 | """Register Flask extensions.""" 122 | cache.init_app(app) 123 | db.init_app(app) 124 | csrf_protect.init_app(app) 125 | 126 | # No debug toolbar for production 127 | if app.config["ENV"] != "production": 128 | debug_toolbar.init_app(app) 129 | 130 | migrate.init_app(app, db) 131 | flask_static_digest.init_app(app) 132 | return None 133 | 134 | 135 | def register_blueprints(app): 136 | """Register Flask blueprints.""" 137 | app.register_blueprint(stats.bp) 138 | # Exclude debugging tools if this is a production environment 139 | if app.config["ENV"] != "production": 140 | app.register_blueprint(splash.blueprint) 141 | return None 142 | 143 | 144 | def register_errorhandlers(app): 145 | """Register error handlers.""" 146 | 147 | def render_error(error): 148 | """Render error template.""" 149 | # If a HTTPException, pull the `code` attribute; default to 500 150 | error_code = getattr(error, "code", 500) 151 | return render_template(f"{error_code}.html"), error_code 152 | 153 | # Pass through http error codes if this is production 154 | if (app.config['ENV'] != 'production'): 155 | for errcode in [401, 404, 500]: 156 | app.errorhandler(errcode)(render_error) 157 | 158 | return None 159 | 160 | 161 | def register_shellcontext(app): 162 | """Register shell context objects.""" 163 | 164 | def shell_context(): 165 | """Shell context objects.""" 166 | return {"db": db} 167 | 168 | app.shell_context_processor(shell_context) 169 | 170 | 171 | def register_commands(app): 172 | """Register Click commands.""" 173 | 
app.cli.add_command(commands.test) 174 | app.cli.add_command(commands.lint) 175 | app.cli.add_command(commands.gendb) 176 | app.cli.add_command(commands.ingest) 177 | app.cli.add_command(commands.configp) 178 | app.cli.add_command(commands.packages) 179 | 180 | 181 | def configure_logger(app): 182 | """Configure loggers.""" 183 | 184 | logger = logging.getLogger("webstats") 185 | logger.setLevel(app.config["LOG_LEVEL"]) 186 | log_file = app.config["LOG_FILEPATH"] 187 | 188 | # Create the log file if necessary 189 | dir_path = os.path.dirname(log_file) 190 | if not os.path.exists(dir_path): 191 | os.makedirs(dir_path) 192 | # Create the file if it does not exist 193 | with open(log_file, 'a'): 194 | os.utime(log_file, None) 195 | 196 | # TODO Why do we need a logrotate.d entry if I have the code below 197 | file_handler = logging.handlers.RotatingFileHandler( 198 | log_file, maxBytes=100000000, backupCount=5 199 | ) 200 | file_handler.setLevel(app.config["LOG_LEVEL"]) 201 | formatter = logging.Formatter( 202 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 203 | ) 204 | file_handler.setFormatter(formatter) 205 | logger.addHandler(file_handler) 206 | 207 | app.logger.handlers = logger.handlers 208 | app.logger.setLevel(logger.level) 209 | -------------------------------------------------------------------------------- /docs/stats_replacement_design_brief.md: -------------------------------------------------------------------------------- 1 | # Design Brief: stats.bioconductor.org replacement 2 | 3 | Author: Robert Shear rshear@ds.dfci.harvard.edu 4 | Date: 2023-10-04 5 | Version: rev 2 6 | 7 | ## Change log 8 | ### rev 2 9 | - 04-Oct-2023 10 | - The "Current State: Static Web Assets" section spoke of 9 content types. There are 8 content types. The `_pkg_scores.tab` example [https://bioconductor.org/packages/stats/bioc/bioc_pkg_scores.tab](https://bioconductor.org/packages/stats/bioc/bioc_pkg_scores.tab) appeared twice in the example table. 11 | 12 | ### rev 1 13 | - 03-Oct-2023 14 | - Added coding language information (markdown+html+javascript) to section [[#Server www.bioconductor.org]] 15 | - Resolved Future State S3 design issues 16 | - Current state description of paths to content with links to examples added 17 | - Publishing to www elaborated 18 | - Future state architecture now simply Python data-aware pages 19 | - Future state Glue issues resolved 20 | - Additional Future State considerations will be determined during development 21 | 22 | ## Initial draft 23 | - 21-Sep-2023 24 | 25 | # Overview 26 | 27 | ![Block Diagram](design_brief_block_diagram.png) 28 | # Current State 29 | ## Repo 30 | - https://github.com/Bioconductor/download_stats 31 | ## Service: S3 Bucket 32 | - The only S3 bucket that is currently active with respect to the stats process is 33 | ``` 34 | s3://bioc-cloudfront-logs 35 | ``` 36 | 37 | - Each object is a gz-compressed W3C Extended Web Log file. The recent files are generated by the www CloudFront distribution. The provenance of the early files, which go back as far as 2013, is unknown, but they are assumed to have the same format as the current files. 38 | - There is no prefix to the objects, i.e., all objects are at the top level with names as seen in the specimen S3 object name below. The code `E1...V3` designates the log source. The `0129f869` is a hash to assure object uniqueness. 39 | ``` 40 | s3://bioc-cloudfront-logs/E1TVLJONPTUXV3.2023-09-17-00.0129f869.gz 41 | ``` 42 | - Objects are destroyed when 6 months old. 
They are all assigned to the Standard storage class. The object count is ~61k and the bucket size is ~33 GB. 44 | ## Server: stats.bioconductor.org 45 | 46 | ### Process: Log Intake 47 | - Copy logs verbatim from S3 bucket to EBS volume 48 | - Python script: `get_s3_logs.py`, repo `Bioconductor/download_stats` 49 | - Runs weekly Sunday 20:00 local time 50 | 51 | ### Data: W3C Web Logs 52 | - Space used: 359 GB 53 | - Location `/home/biocadmin/bioc-access-logs/s3/YYYY-MM-DD/E1TVLJONPTUXV3.YYYY-MM-DD-HH.0129f869.gz` 54 | - Date range: 2013-08-28 - 2023-09-18 55 | - Specimen file name: `E1TVLJONPTUXV3.2023-09-17-00.0129f869.gz` 56 | ### Process: Database Load 57 | - Transforms the raw log files into rows in a SQLite database 58 | - One log line per row, with various column-level transformations 59 | - Python script: `get_s3_logs.py`, repo `Bioconductor/download_stats` 60 | - Runs weekly Sunday 22:00 local time 61 | ### Data: SQLite DBs 62 | - Space used: 67 GB 63 | - Directory: `/home/biocadmin/download_dbs/` 64 | - One file for each year: format `download_db_YYYY.sqlite`, e.g. `download_db_2023.sqlite` 65 | - Range of files: 2009 through 2023 66 | ### Process: Generate Static Web Assets 67 | - Creates static HTML assets that are surfaced by the www.bioconductor.org Apache2 server 68 | - Produced for one topic at a time, weekly. For each topic, there are two scripts, `extractDownloadStats-for-xxx.py` and `makeDownloadStatsHTML-for-xxx.py`, where `xxx` is given in the table below. 69 | 70 | | Topic | When | Code | 71 | | ------------------- | --------------- | --------------- | 72 | | software packages | Monday 12:00 | bioc | 73 | | annotation packages | Wednesday 15:00 | data-annotation | 74 | | experiment packages | Friday 03:00 | data-experiment | 75 | | workflows packages | Saturday 15:00 | workflows | 76 | 77 | 78 | ### Data: Static Web Assets (.html, .txt, .tab) 79 | On stats.bioconductor.org, the web content is stored under directory `/home/biocadmin/public_html/stats/`. When published to master.bioconductor.org, it is stored under `/extra/www/bioc/packages/stats`. 80 | 81 | 82 | The files are organized hierarchically with the keys as shown below. 83 | 84 | | Level | Keys | Example | 85 | | ----- | --------------------------------------- | -------------------------------- | 86 | | 0 | | Aggregate measures | 87 | | 1 | `package_type` | `bioc` | 88 | | 2 | `package_type`, `package_name` | `bioc, S4Vectors` | 89 | | 3 | `package_type`, `package_name`, `year` | `bioc, S4Vectors, 2023` | 90 | 91 | The legal values for `package_type` are `bioc`, `data-annotation`, `data-experiment`, and `workflows`. 92 | 93 | There are 8 content types, 3 of which are HTML documents, the remainder being text files. Specimens of these files can be retrieved by following the links in the table below. 
94 | 95 | | Path | Content type | 96 | |:-------------------------------------------------------------------------------------------------------------------------- |:------------------------ | 97 | | [ROOT](https://bioconductor.org/packages/stats/) | index.html | 98 | | [bioc/](https://bioconductor.org/packages/stats/bioc/) | index.html | 99 | | [bioc/bioc_packages.txt](https://bioconductor.org/packages/stats/bioc/bioc_packages.txt) | bioc_packages.txt | 100 | | [bioc/bioc_pkg_scores.tab](https://bioconductor.org/packages/stats/bioc/bioc_pkg_scores.tab) | bioc_pkg_scores.tab | 101 | | [bioc/bioc_pkg_stats.tab](https://bioconductor.org/packages/stats/bioc/bioc_pkg_stats.tab) | bioc_pkg_stats.tab | 102 | | [bioc/S4Vectors/](https://bioconductor.org/packages/stats/bioc/S4Vectors/) | index.html | 103 | | [bioc/S4Vectors/S4Vectors_2023_stats.tab](https://bioconductor.org/packages/stats/bioc/S4Vectors/S4Vectors_2023_stats.tab) | S4Vectors_2023_stats.tab | 104 | | [bioc/S4Vectors/S4Vectors_stats.tab](https://bioconductor.org/packages/stats/bioc/S4Vectors/S4Vectors_stats.tab) | S4Vectors_stats.tab | 105 | 106 | ## Server: www.bioconductor.org 107 | - Apache2 Server 108 | # Future State 109 | 110 | ## Service: S3 Bucket 111 | 112 | ### Recovering history 113 | The object retention time in the current bucket is 183 days. However, the EBS files in the `/home/biocadmin/bioc-access-logs/s3` directory go back to 2013. These files should be returned to the S3 bucket. S3 storage offers archival durability, while EBS storage does not. In addition, S3 storage is less expensive than EBS storage. Furthermore, there are less expensive S3 storage classes that are appropriate for this use case. 114 | ### Data privacy 115 | The source IP addresses in the weblog records are subject to privacy requirements in GDPR jurisdictions as well as elsewhere. We should encrypt the source IP addresses with a strong, secret asymmetric key. This will allow us to continue to count unique IP addresses, while securing the identity of the user. 116 | ### Partition Projection 117 | When an S3 bucket is the source for an Athena query, the default strategy is to scan the entire bucket (or prefix). This is barely satisfactory in the current state, with 6 months of history. Once the log history has been returned to S3, this would make date-range-limited inquiries intractable. 118 | 119 | Athena has the capability of extracting information from the object path and mapping it to database columns. This is known as a partition projection. Specifically, we propose to add a path prefix composed of the CloudFront distribution ID (for other weblog sources, an alternative unique identifier) followed by the date. So, the file `E1TVLJONPTUXV3.2021-12-26-HH.0129f869.gz` will have an S3 URI like this: 120 | 121 | ``` 122 | s3://bucket_name/E1TVLJONPTUXV3/2021-12-26/E1TVLJONPTUXV3.2021-12-26-HH.0129f869.gz 123 | ``` 124 | 125 | More information may be found at [Setting up partition projection](https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html). Details on mapping the URI structure to columns may be found in the document [Specifying custom S3 storage locations](https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html#partition-projection-specifying-custom-s3-storage-locations). 126 | 127 | #### Implementation 128 | 129 | Before commencing the data migration, the performance characteristics of this partition projection should be verified, as sketched below. 
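
A minimal sketch of what that verification table might look like, assuming the prefix scheme above. Everything here is illustrative: `bucket_name` is a placeholder, and the column list is abbreviated (a production table must declare every CloudFront log field, in order).

```
-- Illustrative Athena DDL; bucket_name and the column list are placeholders.
CREATE EXTERNAL TABLE cloudfront_logs_projected (
  `date` DATE,
  `time` STRING,
  location STRING,
  bytes BIGINT,
  request_ip STRING,
  uri STRING,
  status INT
)
PARTITIONED BY (distribution STRING, log_date STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION 's3://bucket_name/'
TBLPROPERTIES (
  'skip.header.line.count' = '2',
  'projection.enabled' = 'true',
  'projection.distribution.type' = 'enum',
  'projection.distribution.values' = 'E1TVLJONPTUXV3',
  'projection.log_date.type' = 'date',
  'projection.log_date.range' = '2013-08-28,NOW',
  'projection.log_date.format' = 'yyyy-MM-dd',
  'storage.location.template' = 's3://bucket_name/${distribution}/${log_date}/'
);
```

With such a table, a predicate like `WHERE log_date BETWEEN '2023-01-01' AND '2023-01-31'` should scan only the 31 matching prefixes rather than the full bucket; confirming that behavior is the point of the verification.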
130 | 131 | The current CloudFront-to-S3 process will require modification to attach the appropriate prefix to each object. 132 | ### Data Migration 133 | 134 | A new bucket will be established for the weblogs, using the new production account number. The log stream with the new format should be activated and a final synchronization with the old S3 bucket effected. 135 | Due to the file structure of the current-state log files, the logs on the current-state `stats` server can be migrated with an `rsync` command. 136 | ## Service: ETL 137 | ### AWS Glue 138 | The AWS Glue service provides ETL (extract-transform-load) capabilities. Loading transformed weblogs into a relational database is a central use case for Glue. 139 | 140 | Athena is an AWS service based on the Trino (formerly PrestoSQL) open-source distributed SQL query engine. It mediates the consumption of the weblogs and the loading of transformed results into other database systems. The data transformation rules in the Current State Python ETL code will be converted to SQL. 141 | 142 | Note. In 2021, Martin Morgan developed an Athena-based system for analyzing our weblogs directly from S3. The source code can be found at https://github.com/Bioconductor/AthenaLogs. 143 | 144 | ## Service: Relational Database 145 | - AWS Aurora PostgreSQL 146 | ## Service: Data Gateway 147 | - The data gateway will be a Python function run within the AWS Lambda service that will consume HTTP GET URLs and return the appropriate text or HTML document. The Python function will be a composition of the Current State Python code. 148 | ## Server: www.bioconductor.org 149 | All HTTP requests to `https://bioconductor.org/packages/stats/` will be routed to a new Python application that will decode the URI path and produce a bit-for-bit perfect replica of the current output. 150 | 151 | This program will tentatively use the `Flask` framework. 152 | # Additional Considerations 153 | 154 | The following topics will be determined during development. 155 | - Roles 156 | - Tags 157 | - Test Instance 158 | - System Management 159 | - Terraform 160 | - Ansible 161 | - CloudWatch Alerts and Metrics 162 | -------------------------------------------------------------------------------- /bioc_webstats/stats.py: -------------------------------------------------------------------------------- 1 | """Routes for the package download statistics pages. 2 | 3 | This blueprint decodes the legacy /packages/stats/ URI paths and returns 4 | HTML pages or tab-separated text files that replicate the legacy output. 5 | 6 | Raises: 7 | HTTPException: 404 via abort() for unknown categories or packages. 8 | """ 9 | import os 10 | from collections import defaultdict 11 | from datetime import date, timedelta 12 | 13 | from flask import ( 14 | Blueprint, 15 | Response, 16 | abort, 17 | make_response, 18 | render_template, 19 | send_from_directory, 20 | ) 21 | 22 | # TODO mixed pattern! should either import all the models (e.g. Packages) or make everything qualified e.g. db.WebstatsInfo... 
23 | import bioc_webstats.models as db 24 | from bioc_webstats.models import Packages, PackageType, WebstatsInfo 25 | 26 | URI_PATH_PREFIX = "/packages/stats" 27 | 28 | # Map from incoming page name to PackageType 29 | category_map = { 30 | "bioc": { 31 | "category": PackageType.BIOC, 32 | "description": "software", 33 | "index_page": "index.html", 34 | "stem": "bioc", 35 | "tab_page_prefix": "bioc", 36 | "top": 75, 37 | }, 38 | "data-annotation": { 39 | "category": PackageType.ANNOTATION, 40 | "description": "annotation", 41 | "index_page": "data-annotation.html", 42 | "stem": "data-annotation", 43 | "tab_page_prefix": "annotation", 44 | "top": 30, 45 | }, 46 | "data-experiment": { 47 | "category": PackageType.EXPERIMENT, 48 | "description": "experiment", 49 | "index_page": "data-experiment.html", 50 | "stem": "data-experiment", 51 | "tab_page_prefix": "experiment", 52 | "top": 15, 53 | }, 54 | "workflows": { 55 | "category": PackageType.WORKFLOWS, 56 | "description": "workflow", 57 | "index_page": "workflows.html", 58 | "stem": "workflows", 59 | "tab_page_prefix": "workflows", 60 | "top": 0, 61 | }, 62 | } 63 | 64 | bp = Blueprint("stats", __name__, url_prefix=URI_PATH_PREFIX) 65 | 66 | def webstats_response(payload, content_type='text/html') -> Response: 67 | """Create an http Response including response headers required by consuming systems. 68 | 69 | Specifically, the Last-Modified header is set to midnight GMT 70 | immediately following the "valid through" date. 71 | 72 | Arguments: 73 | payload -- The body of the http response, either raw text or html 74 | 75 | Keyword Arguments: 76 | content_type -- The value of the Content-Type header (default: {'text/html'}) 77 | 78 | Returns: 79 | A Response variable, ready to be returned from a route or blueprint decorated function 80 | """ 81 | response = make_response(payload) 82 | response.headers['Content-Type'] = content_type 83 | generated_date = WebstatsInfo.get_valid_thru_date() 84 | modified_date = generated_date + timedelta(days=1) 85 | response.headers['Last-Modified'] = modified_date.strftime("%a, %d %b %Y %H:%M:%S GMT") 86 | return response 87 | 88 | def split_to_dict_list(lst): 89 | """Transform a list into a dictionary keyed on first letter (case insensitive).""" 90 | result = defaultdict(list) 91 | 92 | for item in sorted(lst, key=lambda x: x[0].lower()): 93 | first_char = item[0][0].upper() # Extract the first character of the string 94 | result[first_char].append(item) 95 | 96 | return result 97 | 98 | 99 | def result_list_to_visual_list(rows): 100 | """Transform 3-column database results to 4-column visual results with dense months.""" 101 | dates = set([u[0] for u in rows]) 102 | y0 = min(dates).year 103 | y1 = max(dates).year 104 | holes = ( 105 | set([date(y, m + 1, 1) for y in range(y0, y1 + 1) for m in range(12)]) - dates 106 | ) 107 | out = sorted(rows + [(w, 0, 0) for w in holes], key=lambda x: (-x[0].year, x[0])) 108 | return [ 109 | {'year': dt.year, 'month': dt.strftime("%b") if dt.day == 1 else "all", 'unique_ips': ip, 'downloads': dl} 110 | for dt, ip, dl in out 111 | ] 112 | 113 | 114 | def query_result_to_text(source): 115 | """Transform tabular query results to a string. 116 | 117 | The strings are exact replicas of the *_stats.tab and *_pkg_scores.tab 118 | files found under 119 | www.bioconductor.org/packages/stats/. 120 | 121 | They must match exactly because they may be consumed by external software. 
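
    Example (illustrative values; the header line is the literal heading emitted below):

        Year	Month	Nb_of_distinct_IPs	Nb_of_downloads
        2023	Jan	205	942
        2023	all	1800	11034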
122 | 123 | Arguments: 124 | source -- A list of tuples in the form 125 | [(package, date, ip_count, download_count)] 126 | or 127 | [(date, ip_count, download_count)] 128 | 129 | Returns: 130 | A string in the format of a tab-separated file with one header row. 131 | 132 | """ 133 | 134 | def process_one_package(package, rows): 135 | """For one package produce the result. If package is None, return 4 columns.""" 136 | if package is None: 137 | k = "" 138 | else: 139 | k = package + "\t" 140 | 141 | out = result_list_to_visual_list(rows) 142 | return "\n".join( 143 | [f"{k}{u['year']}\t{u['month']}\t{u['unique_ips']}\t{u['downloads']}" for u in out]) 144 | 145 | if source == []: 146 | return "" 147 | heading = "Year\tMonth\tNb_of_distinct_IPs\tNb_of_downloads" 148 | match len(source[0]): 149 | case 3: 150 | return heading + "\n" + (process_one_package(None, source)) 151 | 152 | case 4: 153 | result = ["Package\t" + heading] 154 | split = {} 155 | for t in source: 156 | split.setdefault(t[0], []).append(t[1:]) 157 | 158 | for k, v in split.items(): 159 | result.append(process_one_package(k, v)) 160 | 161 | return "\n".join(result) 162 | 163 | case _: 164 | raise AssertionError("query_result_to_text expects 3 or 4 columns") 165 | 166 | @bp.route('/static/<path:filename>') 167 | def static_files(filename): 168 | """Redirect requests for static files to the actual static root of webstats.""" 169 | static_folder = os.path.join(os.path.dirname(__file__), 'static') 170 | return send_from_directory(static_folder, filename) 171 | 172 | 173 | @bp.route("/bioc/bioc_packages.txt") 174 | def show_packages(): 175 | """Return the list of package names as a plain-text file.""" 176 | payload = db.Packages.get_package_names() 177 | text = ("\n").join(payload) 178 | return webstats_response(text, content_type="text/plain") 179 | 180 | 181 | @bp.route("/<category>/<package>_pkg_scores.tab") 182 | def show_package_scores(category, package): 183 | """Return the download scores for a category as a tab-separated file.""" 184 | # We match the legacy system, where both the path and the file name include the 185 | # category, in forms like /bioc/bioc_pkg_scores.tab or 186 | # /data-annotation/annotation_pkg_scores.tab. 187 | # `category_map` maps each category name to its attributes: package type, 188 | # description, index page, stem, tab page prefix, and top count. It is used 189 | # throughout this module to retrieve the relevant information based on the 190 | # category name. 191 | 192 | 
selected_category = category_map.get(category, None) 193 | if selected_category is not None and package == selected_category["tab_page_prefix"]: 194 | payload = db.Stats.get_download_scores(category=selected_category["category"]) 195 | else: 196 | abort(404) 197 | text = "Package\tDownload_score\n" + "\n".join([f"{x[0]}\t{x[1]}" for x in payload]) 198 | return webstats_response(text, content_type="text/plain") 199 | 200 | 201 | @bp.route("/<category>/<package>_stats.tab") 202 | @bp.route("/<category>/<package>_<year>_stats.tab") 203 | @bp.route("/<category>/<package_path>/<package>_stats.tab") 204 | @bp.route("/<category>/<package_path>/<package>_<year>_stats.tab") 205 | def show_package_stats(category, package, package_path=None, year=None): 206 | """Return download statistics as a tab-separated (.tab) file.""" 207 | selected_category = category_map.get(category, None) 208 | if selected_category is None: 209 | abort(404) 210 | # If there is a second level in the path, then it can only be the package name, 211 | # and that name must match the package name at the leaf. 212 | # If the url is for all the packages in the repo, 213 | # it will be in the form /bioc/bioc_pkg_stats.tab and the year parameter will be 'pkg' 214 | 215 | # Helpful to keep the nested conditionals reasonably simple 216 | payload = None 217 | 218 | if package_path is None and selected_category["stem"] == category: 219 | # Here the package is actually the name of the category 220 | package = None 221 | if year == 'pkg': 222 | # Here are /bioc/bioc_pkg_stats.tab and /data-experiment/experiment_pkg_stats.tab 223 | # In this case we will report all packages in the category 224 | year = None 225 | else: 226 | # Here we report combined data, either for a year or for all years 227 | payload = db.Categorystats.get_combined_counts(selected_category["category"], year) 228 | 229 | if payload is None: 230 | payload = db.Stats.get_download_counts(selected_category["category"], package, year) 231 | 232 | if payload == []: 233 | abort(404) 234 | 235 | return webstats_response(query_result_to_text(payload), content_type="text/plain") 236 | 237 | 238 | @bp.route("/") 239 | @bp.route("/index.html") 240 | @bp.route("/<category>.html") 241 | def show_package_summary(category="bioc"): 242 | """Render package summary page.""" 243 | selected_category = category_map.get(category, None) 244 | if selected_category is None: 245 | abort(404) 246 | category_enum = selected_category["category"] 247 | scores = db.Stats.get_download_scores(category_enum) 248 | url_list = [ 249 | [u["index_page"], u["description"]] 250 | for u in category_map.values() 251 | if selected_category["category"] != u["category"] 252 | ] 253 | top_count = selected_category["top"] 254 | top = sorted(scores, key=lambda x: x[-1])[:top_count] 255 | 256 | result = render_template( 257 | "category.html", 258 | top_count=top_count, 259 | category_links=url_list, 260 | category=category_enum, 261 | category_name=selected_category["description"], 262 | category_url_stem=selected_category["stem"], 263 | tab_page_prefix=selected_category["tab_page_prefix"], 264 | generated_date=WebstatsInfo.get_valid_thru_date(), 265 | top=top, 266 | scores=split_to_dict_list(scores), 267 | ) 268 | return webstats_response(result) 269 | 270 | 271 | @bp.route("/<category>") 272 | @bp.route("/<category>/") 273 | @bp.route("/<category>/<package>/") 274 | @bp.route("/<category>/index.html") 275 | @bp.route("/<category>/<package>/index.html") 276 | def show_package_details(category, package=None): 277 | """Display package details.""" 278 | selected_category = category_map.get(category, None) 279 | if selected_category is None: 280 | abort(404) 281 | 282 | if package is None: 283 | source = db.Categorystats.get_combined_counts(selected_category["category"]) 284 | 
depver = None 285 | else: 286 | source = db.Stats.get_download_counts(selected_category["category"], package) 287 | if len(source) == 0: 288 | abort(404) 289 | package_info = db.Packages.get_package_details(package) 290 | depver = package_info[3] 291 | 292 | if depver is not None: 293 | depver = depver[0] + "." + str(int(depver[1:3]))  # e.g. "318" -> "3.18" 294 | 295 | split = {} 296 | for t in source: 297 | split.setdefault(t[0].year, []).append(t) 298 | 299 | data_by_year = {year: result_list_to_visual_list(data) for year, data in split.items()} 300 | generated_date = WebstatsInfo.get_valid_thru_date() 301 | 302 | result = render_template( 303 | "stats-bioc.html", 304 | category=category, 305 | category_name=selected_category["description"], 306 | category_stem=selected_category["stem"], 307 | category_index_page=('/').join((bp.url_prefix, selected_category["index_page"])), 308 | package=package, 309 | generated_date=generated_date, 310 | data_by_year=data_by_year, 311 | first_year=list(data_by_year.keys())[-1], 312 | last_year=list(data_by_year.keys())[0], 313 | deprecated_version=depver 314 | ) 315 | return webstats_response(result) 316 | 317 | --------------------------------------------------------------------------------