├── .editorconfig
├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── dockerpublish.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── backup.py
├── backup
│   └── .gitignore
├── bin
│   ├── clean.sh
│   ├── cron.sh
│   ├── cronlog.sh
│   ├── django_wait_for_db.sh
│   ├── example_data.sh
│   ├── gunicorn.sh
│   ├── restore.sh
│   └── swarm_update.sh
├── crontab
├── db.cnf
├── django_wait_for_migrations.py
├── docker-compose.yml
├── docs
│   └── pull_request_template.md
├── extlinks
│   ├── __init__.py
│   ├── aggregates
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── factories.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   ├── commands
│   │   │   │   ├── __init__.py
│   │   │   │   ├── archive_link_aggregates.py
│   │   │   │   ├── archive_pageproject_aggregates.py
│   │   │   │   ├── archive_user_aggregates.py
│   │   │   │   ├── fill_link_aggregates.py
│   │   │   │   ├── fill_monthly_link_aggregates.py
│   │   │   │   ├── fill_monthly_pageproject_aggregates.py
│   │   │   │   ├── fill_monthly_user_aggregates.py
│   │   │   │   ├── fill_pageproject_aggregates.py
│   │   │   │   └── fill_user_aggregates.py
│   │   │   └── helpers
│   │   │       ├── __init__.py
│   │   │       └── aggregate_archive_command.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   ├── 0002_useraggregate.py
│   │   │   ├── 0003_add_indexes.py
│   │   │   ├── 0004_pageprojectaggregate.py
│   │   │   ├── 0005_add_organisation_index.py
│   │   │   ├── 0006_delete_aggregate_tables_info.py
│   │   │   ├── 0007_add_user_list_flags.py
│   │   │   ├── 0008_alter_linkaggregate_id_alter_pageprojectaggregate_id_and_more.py
│   │   │   ├── 0009_pageprojectaggregate_composite_index.py
│   │   │   ├── 0010_add_aggregate_indexes.py
│   │   │   ├── 0011_aggregate_composite_indexes.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   └── views.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── forms.py
│   │   ├── helpers.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       └── import_twl_data.py
│   │   ├── swift.py
│   │   ├── templates
│   │   │   └── common
│   │   │       ├── statistics_table.html
│   │   │       └── top_organisations_table.html
│   │   ├── templatetags
│   │   │   ├── __init__.py
│   │   │   └── common_filters.py
│   │   ├── tests.py
│   │   ├── urls.py
│   │   └── views.py
│   ├── healthcheck
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── migrations
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   ├── urls.py
│   │   └── views.py
│   ├── links
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── factories.py
│   │   ├── helpers.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       ├── fix_proxy_linkevents_on_user_list.py
│   │   │       ├── linkevent_example_data.py
│   │   │       ├── linkevents_archive.py
│   │   │       ├── linkevents_collect.py
│   │   │       ├── linksearchtotal_collect.py
│   │   │       ├── linksearchtotal_example_data.py
│   │   │       ├── remove_ezproxy_collection.py
│   │   │       └── upload_all_archived.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   ├── 0002_auto_20190520_1530.py
│   │   │   ├── 0003_auto_20190530_1045.py
│   │   │   ├── 0004_auto_20190603_1110.py
│   │   │   ├── 0005_linkevent_user_is_bot.py
│   │   │   ├── 0006_auto_20190628_1221.py
│   │   │   ├── 0007_auto_20190730_1355.py
│   │   │   ├── 0008_fill_proquest_openurl.py
│   │   │   ├── 0009_auto_20230215_1656.py
│   │   │   ├── 0010_data_link_event_id_hash.py
│   │   │   ├── 0011_auto_20230217_1326.py
│   │   │   ├── 0012_alter_linkevent_id_alter_linksearchtotal_id_and_more.py
│   │   │   ├── 0013_add_linkevent_url_linkevent_content_type_and_more.py
│   │   │   ├── 0014_migrate_url_pattern_relationships.py
│   │   │   ├── __init__.py
│   │   │   └── urlpatterns.json
│   │   ├── models.py
│   │   └── tests.py
│   ├── logs
│   │   └── .gitignore
│   ├── organisations
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── factories.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       └── users_update_lists.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   ├── 0002_auto_20190603_1255.py
│   │   │   ├── 0003_auto_20190603_1325.py
│   │   │   ├── 0004_auto_20190603_1325.py
│   │   │   ├── 0005_auto_20190628_1221.py
│   │   │   ├── 0006_auto_20190730_1355.py
│   │   │   ├── 0007_auto_20230216_1931.py
│   │   │   ├── 0008_alter_collection_id_alter_organisation_id_and_more.py
│   │   │   ├── 0009_organisation_username_list_updated.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── templates
│   │   │   └── organisations
│   │   │       ├── organisation_charts_include.html
│   │   │       ├── organisation_detail.html
│   │   │       └── organisation_list.html
│   │   ├── tests.py
│   │   ├── urls.py
│   │   └── views.py
│   ├── programs
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── factories.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       └── programs_example_data.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   ├── 0002_auto_20190603_1255.py
│   │   │   ├── 0003_alter_program_id.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── templates
│   │   │   └── programs
│   │   │       ├── program_charts_include.html
│   │   │       ├── program_detail.html
│   │   │       └── program_list.html
│   │   ├── tests.py
│   │   ├── urls.py
│   │   └── views.py
│   ├── settings
│   │   ├── base.py
│   │   ├── helpers.py
│   │   ├── local.py
│   │   ├── logging.py
│   │   └── production.py
│   ├── templates
│   │   ├── base.html
│   │   ├── documentation.html
│   │   └── homepage.html
│   ├── tests.py
│   ├── urls.py
│   ├── views.py
│   └── wsgi.py
├── manage.py
├── nginx.conf
├── requirements
│   ├── django.txt
│   └── local.txt
├── robots.txt
├── static
│   ├── css
│   │   └── local.css
│   └── favicon.ico
├── template.env
└── wiki-list.csv
/.editorconfig:
--------------------------------------------------------------------------------
1 | # top-most EditorConfig file
2 | root = true
3 |
4 | # Unix-style newlines with a newline ending every file
5 | [*]
6 | end_of_line = lf
7 | insert_final_newline = true
8 |
9 | # 4 space indentation for Python files
10 | [*.py]
11 | indent_style = space
12 | indent_size = 4
13 | trim_trailing_whitespace = true
14 |
15 | # 2 space indentation for YAML files
16 | [*.{yml, yaml}]
17 | indent_style = space
18 | indent_size = 2
19 |
20 | # 2 space indentation for HTML files
21 | [*.html]
22 | indent_style = space
23 | indent_size = 2
24 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "daily"
12 |
--------------------------------------------------------------------------------
/.github/workflows/dockerpublish.yml:
--------------------------------------------------------------------------------
1 | name: Docker
2 |
3 | on:
4 | push:
5 | # Publish `master` as Docker `latest` image.
6 | branches:
7 | - master
8 | - staging
9 |
10 | # Run tests for any PRs.
11 | pull_request:
12 |
13 | jobs:
14 | # Run tests.
15 | test:
16 | # Ensure latest python image is mirrored before running tests.
17 | runs-on: ubuntu-latest
18 | steps:
19 | - uses: actions/checkout@v4
20 | - name: Build and Start Images
21 | run: |
22 | cp template.env .env
23 | docker compose up -d --build
24 | - name: Run tests
25 | run: |
26 | docker compose exec -T externallinks /app/bin/django_wait_for_db.sh python django_wait_for_migrations.py test
27 |
28 | # Push images to quay.io/wikipedialibrary.
29 | push:
30 | # Ensure test job passes before pushing images.
31 | needs: test
32 | runs-on: ubuntu-latest
33 | if: github.event_name == 'push'
34 |
35 | steps:
36 | - uses: actions/checkout@v4
37 |
38 | - name: Log into quay.io
39 | run: echo "${{ secrets.CR_PASSWORD }}" | docker login quay.io -u ${{ secrets.CR_USERNAME }} --password-stdin
40 |
41 | - name: Build Images
42 | run: |
43 | cp template.env .env
44 | docker compose build
45 |
46 | - name: Set branch tag
47 | id: branch
48 | run: |
49 | # Strip git ref prefix from version
50 | branch_tag=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
51 |
52 | # Strip "v" prefix from tag name
53 | [[ "${{ github.ref }}" == "refs/tags/"* ]] && branch_tag=$(echo $branch_tag | sed -e 's/^v//')
54 |
55 | # prepend with "branch_" so we know what the tag means by looking at it.
56 | branch_tag="branch_${branch_tag}"
57 |
58 | echo ::set-output name=tag::$(echo $branch_tag)
59 |
60 | - name: Set commit tag
61 | id: commit
62 | run: |
63 | # The short git commit object name.
64 | commit_tag=${GITHUB_SHA::8}
65 |
66 | # prepend with "commit_" so we know what the tag means by looking at it.
67 | commit_tag="commit_${commit_tag}"
68 |
69 | echo ::set-output name=tag::$(echo $commit_tag)
70 |
71 | - name: Push externallinks image to quay.io/wikipedialibrary
72 | run: |
73 | # The image name represents both the local image name and the remote image repository.
74 | image_name=quay.io/wikipedialibrary/externallinks
75 | branch_tag=${{ steps.branch.outputs.tag }}
76 | commit_tag=${{ steps.commit.outputs.tag }}
77 |
78 | docker tag ${image_name}:latest ${image_name}:${branch_tag}
79 | docker tag ${image_name}:latest ${image_name}:${commit_tag}
80 | docker push ${image_name}:${branch_tag}
81 | docker push ${image_name}:${commit_tag}
82 |
83 | - name: Push eventstream image to quay.io/wikipedialibrary
84 | run: |
85 | # The image name represents both the local image name and the remote image repository.
86 | image_name=quay.io/wikipedialibrary/eventstream
87 | branch_tag=${{ steps.branch.outputs.tag }}
88 | commit_tag=${{ steps.commit.outputs.tag }}
89 |
90 | docker tag ${image_name}:latest ${image_name}:${branch_tag}
91 | docker tag ${image_name}:latest ${image_name}:${commit_tag}
92 | docker push ${image_name}:${branch_tag}
93 | docker push ${image_name}:${commit_tag}
94 |
95 | - name: Push externallinks_cron image to quay.io/wikipedialibrary
96 | run: |
97 | # The image name represents both the local image name and the remote image repository.
98 | image_name=quay.io/wikipedialibrary/externallinks_cron
99 | branch_tag=${{ steps.branch.outputs.tag }}
100 | commit_tag=${{ steps.commit.outputs.tag }}
101 |
102 | docker tag ${image_name}:latest ${image_name}:${branch_tag}
103 | docker tag ${image_name}:latest ${image_name}:${commit_tag}
104 | docker push ${image_name}:${branch_tag}
105 | docker push ${image_name}:${commit_tag}
106 |
--------------------------------------------------------------------------------
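For illustration only, the two tag-derivation steps above reduce to the following shell sketch; the ref and SHA values are hypothetical, not taken from the repository.

    ref="refs/heads/staging"                                         # shape of github.ref for a branch push
    branch_tag="branch_$(echo "$ref" | sed -e 's,.*/\(.*\),\1,')"    # -> branch_staging
    commit_tag="commit_${GITHUB_SHA::8}"                             # e.g. commit_f86118b1 (8-char short SHA)

Each image is then pushed twice, once under the branch tag and once under the commit tag, so deployments can pin either the branch head or an exact commit.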
/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 | .idea
3 | .vscode
4 | .env
5 | .env.*
6 | .swp
7 |
8 | # Python stuff
9 | __pycache__/
10 | *.pyc
11 |
12 | static/admin
13 | static/debug_toolbar
14 | static/django_extensions
15 |
16 | db.json
17 |
18 | .coverage
19 | htmlcov/
20 |
21 | .DS_Store
22 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile
2 | FROM quay.io/wikipedialibrary/python:3.11-bullseye-updated AS eventstream
3 |
4 | WORKDIR /app
5 | ARG REQUIREMENTS_FILE
6 | ENV REQUIREMENTS_FILE=${REQUIREMENTS_FILE:-django.txt}
7 | COPY requirements/* /app/requirements/
8 | RUN echo "Installing $REQUIREMENTS_FILE" && pip install -r /app/requirements/$REQUIREMENTS_FILE
9 | RUN apt update && apt install -y default-mysql-client && rm -rf /var/lib/apt/lists/* && rm -f /var/log/apt/*
10 | # This file only exists once the code directory is mounted by docker-compose.
11 | ENTRYPOINT ["/app/bin/django_wait_for_db.sh"]
12 |
13 | FROM eventstream AS externallinks
14 | RUN pip install gunicorn
15 |
16 | FROM eventstream AS cron
17 | RUN apt update && apt install -y cron && rm -rf /var/lib/apt/lists/* && rm -f /var/log/apt/*
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Wikimedia Foundation
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/backup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | from datetime import datetime
4 | from dotenv import load_dotenv
5 | from filelock import FileLock
6 | import subprocess
7 | import os
8 |
9 |
10 | def backup(args):
11 | ## Dump and gzip DB
12 | date = datetime.today().strftime("%Y%m%d")
13 | print("Backing up database.")
14 | filename = "/app/backup/{}.sql.gz".format(date)
15 | extra_opts = ""
16 | if args.missing_only:
17 | extra_opts = "--insert-ignore --no-create-info --skip-opt"
18 | filename = "/app/backup/{}.missing-only.sql.gz".format(date)
19 | command = 'nice -n 5 bash -c "mysqldump {extra_opts} --skip-comments -h db -u root -p{mysql_root_password} {mysql_database} | gzip > {filename}"'.format(
20 | extra_opts=extra_opts,
21 | mysql_root_password=os.environ["MYSQL_ROOT_PASSWORD"],
22 | mysql_database=os.environ["MYSQL_DATABASE"],
23 | filename=filename,
24 | )
25 | subprocess.run(command, shell=True, check=True)
26 |
27 | ## `root:wikidev` only; using IDs instead of names to avoid problems in localdev
28 | os.chown(filename, 0, 500)
29 | os.chmod(filename, 0o640)
30 |
31 | print("Finished backup.")
32 |
33 |
34 | def clean():
35 | # Retain backups for 14 days.
36 | subprocess.run(
37 | 'find /app/backup -name "*.sql.gz" -mtime +14 -delete || :',
38 | shell=True,
39 | check=True,
40 | )
41 | print("Removed backups created 14 days ago or more.")
42 |
43 |
44 | def main():
45 | load_dotenv(".env")
46 | parser = argparse.ArgumentParser(description="externallinks compressed backup")
47 | parser.add_argument("--missing_only", action="store_true")
48 | args = parser.parse_args()
49 |
50 | # Use a lockfile to prevent overruns.
51 | lockfile = "/tmp/backup.lock"
52 | lock = FileLock(lockfile)
53 | lock.acquire()
54 | try:
55 | backup(args)
56 | clean()
57 | finally:
58 | lock.release()
59 | os.remove(lockfile)
60 |
61 |
62 | if __name__ == "__main__":
63 | main()
64 |
--------------------------------------------------------------------------------
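A minimal usage sketch for backup.py, assuming the service names defined in docker-compose.yml (the crons service mounts the same /app and /app/backup paths; the date in the filenames is whatever day the dump runs):

    docker compose exec crons python backup.py                  # full dump to /app/backup/YYYYMMDD.sql.gz
    docker compose exec crons python backup.py --missing_only   # insert-ignore dump to /app/backup/YYYYMMDD.missing-only.sql.gz

In normal operation the crontab below runs python backup.py every other day at 06:30.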
/backup/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 |
--------------------------------------------------------------------------------
/bin/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eo pipefail
4 |
5 | PATH=/usr/local/bin:/usr/bin:/bin:/sbin:/app/bin:$PATH
6 |
7 | if /app/bin/django_wait_for_db.sh
8 | then
9 | echo "This will drop all tables in ${MYSQL_DATABASE}. Proceed [y/N]?"
10 | read -p "This will drop all tables in ${MYSQL_DATABASE}. Proceed [y/N]?" -n 1 -r
11 | echo ""
12 | if [[ ! $REPLY =~ ^[Yy]$ ]]
13 | then
14 | echo "Exiting..."
15 | exit
16 | fi
17 | mysql_cmd="mysql -h db -u root -p${MYSQL_ROOT_PASSWORD} -D ${MYSQL_DATABASE}"
18 | # Build an SQL statement for dropping every table
19 | concat_fragment="GROUP_CONCAT('DROP TABLE IF EXISTS ', table_name SEPARATOR ';')"
20 | get_tables_query="SELECT ${concat_fragment} FROM information_schema.tables WHERE table_schema = '${MYSQL_DATABASE}';"
21 | drop_query=$(echo ${get_tables_query} | ${mysql_cmd})
22 | drop_query=${drop_query/$concat_fragment/}
23 | drop_query=${drop_query//[$'\r\n']}
24 | if [ "$drop_query" == "NULL" ]
25 | then
26 | echo "No tables to drop."
27 | exit
28 | fi
29 | drop_query="SET FOREIGN_KEY_CHECKS = 0;${drop_query};SET FOREIGN_KEY_CHECKS = 1;"
30 | echo "Dropping tables."
31 | echo ${drop_query}
32 | nice -n 5 bash -c "echo \"${drop_query}\" | ${mysql_cmd}"
33 |
34 | echo "Tables dropped."
35 | else
36 | exit 1
37 | fi
38 |
39 |
--------------------------------------------------------------------------------
/bin/cron.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # copy crontab, set permissions, and start cron
3 | set -eo pipefail
4 | PATH=/usr/local/bin:/usr/bin:/bin:/sbin:/app/bin:$PATH
5 | if /app/bin/django_wait_for_db.sh
6 | then
7 | cp /app/crontab /etc/crontab
8 | # `root:wikidev` only; using IDs instead of names to avoid problems in localdev
9 | chown 0:500 /etc/crontab
10 | chmod 640 /etc/crontab
11 | echo "Starting cron."
12 | cron -f -L 8
13 | else
14 | echo "ERROR: couldn't start cron."
15 | exit 1
16 | fi
17 |
--------------------------------------------------------------------------------
/bin/cronlog.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # A simple wrapper to redirect cron STDOUT & STDERR to docker logs
3 | set -eo pipefail
4 | PATH=/usr/local/bin:/usr/bin:/bin:/sbin:/app/bin:$PATH
5 | cd /app
6 | bash "$@">/proc/1/fd/1 2>/proc/1/fd/2
7 |
--------------------------------------------------------------------------------
/bin/django_wait_for_db.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Try to get a db shell
4 | db_init_wait=0
5 | db_init_timeout=60
6 | function connect() {
7 | connect=$(echo 'exit' | python manage.py dbshell 2>&1 >/dev/null)
8 | if ${connect} 2>/dev/null
9 | then
10 | true
11 | else
12 | echo ${connect} | sed -e "s/'--\(user\|password\)=[^']*'/'--\1=******'/g" >/tmp/externallink_db_connect
13 | false
14 | fi
15 | }
16 |
17 | until connect || [ $db_init_wait -eq $db_init_timeout ]
18 | do
19 | >&2 echo "Waiting for DB."
20 | sleep 1
21 | db_init_wait=$(( $db_init_wait + 1 ))
22 | done
23 |
24 | if [ $db_init_wait -lt $db_init_timeout ]
25 | then
26 | >&2 echo "DB up."
27 | rm /tmp/externallink_db_connect 2>/dev/null || :
28 | exec "$@"
29 | else
30 | cat /tmp/externallink_db_connect
31 | rm /tmp/externallink_db_connect 2>/dev/null || :
32 | exit 1
33 | fi
34 |
--------------------------------------------------------------------------------
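This script is a wrapper: it retries python manage.py dbshell for up to 60 seconds, then execs whatever arguments it was given. A usage sketch matching how the Dockerfile ENTRYPOINT and the CI test job use it (the wrapped command here is only an example):

    /app/bin/django_wait_for_db.sh python manage.py migrate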
/bin/example_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "Creating Programs"
4 | python /app/manage.py programs_example_data 10
5 | echo "Creating LinkSearchTotals"
6 | python /app/manage.py linksearchtotal_example_data 60
7 | echo "Creating LinkEvents"
8 | python /app/manage.py linkevent_example_data 10000
9 | echo "Done"
--------------------------------------------------------------------------------
/bin/gunicorn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python manage.py migrate
4 | python manage.py collectstatic --noinput
5 |
6 | exec gunicorn extlinks.wsgi:application \
7 | --name extlinks_django \
8 | --bind 0.0.0.0:8000 \
9 | --worker-class gthread \
10 | --workers 7 \
11 | --threads 1 \
12 | --timeout 30 \
13 | --backlog 2048 \
14 | --log-level=info \
15 | --reload \
16 | "$@"
17 |
--------------------------------------------------------------------------------
/bin/restore.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eo pipefail
4 |
5 | PATH=/usr/local/bin:/usr/bin:/bin:/sbin:/app/bin:$PATH
6 |
7 | restore_file=${1}
8 |
9 | if /app/bin/django_wait_for_db.sh
10 | then
11 |
12 | echo "This may drop the DB. Proceed [y/N]?"
13 | read -p "This may drop the DB. Proceed [y/N]?" -n 1 -r
14 | echo ""
15 | if [[ ! $REPLY =~ ^[Yy]$ ]]
16 | then
17 | echo "Exiting..."
18 | exit
19 | fi
20 |
21 | echo "Importing backup DB."
22 | nice -n 5 bash -c "gunzip -c ${restore_file} | mysql -h db -u root -p${MYSQL_ROOT_PASSWORD} -D ${MYSQL_DATABASE}"
23 |
24 | ## Run any necessary DB operations.
25 | python /app/manage.py migrate
26 |
27 | echo "Finished restore."
28 | else
29 | exit 1
30 | fi
31 |
32 |
--------------------------------------------------------------------------------
/bin/swarm_update.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # One level up from this script, which should be the root of this repo.
4 | dir=$(dirname $(readlink -f $0))/..
5 |
6 | # Gets a value from a dotenv file at .env.
7 | function default () {
8 | grep ${1} ${dir}/.env | cut -d '=' -f2
9 | }
10 |
11 | # Pull updated image if available.
12 | function pull () {
13 | image_name=${1}
14 | tag=${2}
15 | target=${image_name}:${tag}
16 |
17 | # Check for newer image
18 | pull=$(docker pull ${target})
19 |
20 | # Pull swarm config updates and update the stack if there is a new image.
21 | if echo ${pull} | grep "Status: Downloaded newer image for ${target}" >/dev/null
22 | then
23 | echo "${target} updated"
24 |
25 | # Report if the local image is already up to date.
26 | elif echo ${pull} | grep "Status: Image is up to date for ${target}" >/dev/null
27 | then
28 | echo "${target} already up to date"
29 |
30 | # Fail in any other circumstance.
31 | else
32 | echo "Error updating ${target}"
33 | exit 1
34 | fi
35 | }
36 |
37 | # Take .env values or arguments.
38 | env=${1:-$(default ENV)}
39 | externallinks_tag=${2:-$(default EXTERNALLINKS_TAG)}
40 | eventstream_tag=${3:-$(default EVENTSTREAM_TAG)}
41 |
42 | if [ -z "$env" ] || [ -z "$externallinks_tag" ] || [ -z "$eventstream_tag" ]
43 | then
44 | echo "Usage: swarm_update.sh \$env \$externallinks_tag \$eventstream_tag
45 | \$env docker swarm environment (eg. staging | production).
46 | \$externallinks_tag docker hub image tag (eg. branch_staging | branch_production | latest)
47 | \$eventstream_tag docker hub image tag (eg. branch_staging | branch_production | latest)"
48 | exit 1;
49 | fi
50 |
51 | # Pull image updates.
52 | pull quay.io/wikipedialibrary/externallinks ${externallinks_tag}
53 | pull quay.io/wikipedialibrary/eventstream ${eventstream_tag}
54 |
55 | # Update repository for updates to code or to the swarm deployment itself.
56 | git -C ${dir} pull
57 |
58 | # Deploy the updates.
59 | docker stack deploy -c <(cd ${dir}; docker-compose config 2>/dev/null) ${env}
60 |
--------------------------------------------------------------------------------
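A usage sketch mirroring the script's own usage text; the environment and tag values are illustrative and would normally come from .env:

    ./bin/swarm_update.sh staging branch_staging branch_staging
    # or, with ENV, EXTERNALLINKS_TAG and EVENTSTREAM_TAG already set in .env:
    ./bin/swarm_update.sh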
/crontab:
--------------------------------------------------------------------------------
1 | # /etc/crontab: system-wide crontab
2 | # Unlike any other crontab you don't have to run the `crontab'
3 | # command to install the new version when you edit this file
4 | # and files in /etc/cron.d. These files also have username fields,
5 | # that none of the other crontabs do.
6 |
7 | SHELL=/app/bin/cronlog.sh
8 | PATH=/app/bin:/usr/bin:/bin
9 |
10 | # Example of job definition:
11 | #.---------------- minute (0 - 59)
12 | #| .------------- hour (0 - 23)
13 | #| | .---------- day of month (1 - 31)
14 | #| | | .------- month (1 - 12) OR jan,feb,mar,apr ...
15 | #| | | | .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat
16 | #| | | | |
17 | #* * * * * user command to be executed
18 | 30 6 */2 * * root python backup.py
19 | # from extlinks/aggregates/cron.py
20 | # daily
21 | 0 0 * * * root python manage.py fill_link_aggregates
22 | 5 0 * * * root python manage.py fill_user_aggregates
23 | 45 0 * * * root python manage.py fill_pageproject_aggregates
24 | 0 3 * * * root python manage.py fill_monthly_link_aggregates
25 | 10 3 * * * root python manage.py fill_monthly_user_aggregates
26 | 50 3 * * * root python manage.py fill_monthly_pageproject_aggregates
27 | # from extlinks/links/cron.py
28 | # weekly
29 | 10 5 * * 1 root python manage.py linksearchtotal_collect
30 | # daily
31 | 0 2 * * * root python manage.py linkevents_archive dump
32 | # from extlinks/organisations/cron.py
33 | # hourly (was every 65 minutes for some reason?)
34 | 5 * * * * root python manage.py users_update_lists
35 |
36 |
--------------------------------------------------------------------------------
/db.cnf:
--------------------------------------------------------------------------------
1 | [mysqld]
2 | collation-server = utf8mb4_unicode_ci
3 | init-connect = SET NAMES utf8mb4 COLLATE utf8mb4_unicode_ci
4 | character-set-server = utf8mb4
5 | max_allowed_packet=1024M
6 | connect_timeout = 1200
7 | net_read_timeout = 180
8 | innodb_buffer_pool_size = 512M
9 |
10 |
--------------------------------------------------------------------------------
/django_wait_for_migrations.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Wrapper for Django's command-line utility for administrative tasks."""
3 | import os
4 | import sys
5 | import logging
6 | from time import sleep
7 |
8 | try:
9 | from django.db.migrations.executor import MigrationExecutor
10 | from django.db.utils import ConnectionHandler, DEFAULT_DB_ALIAS
11 | except ImportError as exc:
12 | raise ImportError(
13 | "Couldn't import Django. Are you sure it's installed and "
14 | "available on your PYTHONPATH environment variable? Did you "
15 | "forget to activate a virtual environment?"
16 | ) from exc
17 |
18 |
19 | def db_migrated(database):
20 | connections = ConnectionHandler()
21 | connection = connections[database]
22 | connection.prepare_database()
23 | executor = MigrationExecutor(connection)
24 | targets = executor.loader.graph.leaf_nodes()
25 | return not executor.migration_plan(targets)
26 |
27 |
28 | def wait_for_migrations(args):
29 | try:
30 | from django import setup
31 | from django.core.management import execute_from_command_line
32 | except ImportError as exc:
33 | raise ImportError(
34 | "Couldn't import Django. Are you sure it's installed and "
35 | "available on your PYTHONPATH environment variable? Did you "
36 | "forget to activate a virtual environment?"
37 | ) from exc
38 | setup()
39 | logger = logging.getLogger(__name__)
40 | wait = 0
41 | # Unapplied migrations found.
42 | while not db_migrated(DEFAULT_DB_ALIAS):
43 | logger.info("Unapplied migrations found.")
44 | sleep(1)
45 | wait += 1
46 |
47 | if wait > 30:
48 | raise Exception("Migration timeout")
49 |
50 | # All migrations have been applied.
51 | if db_migrated(DEFAULT_DB_ALIAS):
52 | logger.info("All migrations have been applied.")
53 | execute_from_command_line(args)
54 | else:
55 | raise Exception("Unknown error.")
56 |
57 |
58 | if __name__ == "__main__":
59 | wait_for_migrations(sys.argv)
60 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: "3.8"
3 | volumes:
4 | mysql:
5 |
6 | services:
7 | externallinks:
8 | image: quay.io/wikipedialibrary/externallinks:${EXTERNALLINKS_TAG}
9 | build:
10 | context: .
11 | target: externallinks
12 | env_file:
13 | - ".env"
14 | depends_on:
15 | - db
16 | ports:
17 | - "8000:8000"
18 | command: ["/app/bin/gunicorn.sh"]
19 | volumes:
20 | - type: bind
21 | source: ./
22 | target: /app
23 | - type: bind
24 | source: ${HOST_BACKUP_DIR}
25 | target: /app/backup
26 | deploy:
27 | resources:
28 | reservations:
29 | cpus: "0.25"
30 | memory: "384M"
31 | crons:
32 | image: quay.io/wikipedialibrary/externallinks_cron:${EXTERNALLINKS_TAG}
33 | build:
34 | context: .
35 | target: cron
36 | env_file:
37 | - ".env"
38 | depends_on:
39 | - db
40 | command: ["/app/bin/cron.sh"]
41 | volumes:
42 | - type: bind
43 | source: ./
44 | target: /app
45 | - type: bind
46 | source: ${HOST_BACKUP_DIR}
47 | target: /app/backup
48 | deploy:
49 | resources:
50 | reservations:
51 | cpus: "0.25"
52 | memory: "384M"
53 | db:
54 | image: quay.io/wikipedialibrary/mariadb:10-updated
55 | env_file:
56 | - ".env"
57 | ports:
58 | - "3306:3306"
59 | volumes:
60 | - type: volume
61 | source: mysql
62 | target: /var/lib/mysql
63 | volume: {}
64 | - type: bind
65 | source: ./db.cnf
66 | target: /etc/mysql/conf.d/db.cnf
67 | healthcheck:
68 | test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-plinks"]
69 | timeout: 20s
70 | interval: 10s
71 | retries: 10
72 | deploy:
73 | resources:
74 | reservations:
75 | cpus: "0.5"
76 | memory: "2.5G"
77 | nginx:
78 | image: quay.io/wikipedialibrary/nginx:latest-updated
79 | volumes:
80 | - type: volume
81 | target: /var/lib/nginx/cache
82 | - type: bind
83 | source: ./nginx.conf
84 | target: /etc/nginx/conf.d/default.conf
85 | - type: bind
86 | source: ./robots.txt
87 | target: /app/robots.txt
88 | - type: bind
89 | source: ./static
90 | target: /app/static
91 | ports:
92 | - "80:80"
93 | depends_on:
94 | - externallinks
95 | deploy:
96 | resources:
97 | reservations:
98 | cpus: "0.25"
99 | memory: "32M"
100 | eventstream:
101 | image: quay.io/wikipedialibrary/eventstream:${EVENTSTREAM_TAG}
102 | build:
103 | context: .
104 | target: eventstream
105 | depends_on:
106 | - db
107 | env_file:
108 | - ".env"
109 | command:
110 | [
111 | "python",
112 | "django_wait_for_migrations.py",
113 | "linkevents_collect",
114 | "--historical",
115 | ]
116 | volumes:
117 | - type: bind
118 | source: ./
119 | target: /app
120 | deploy:
121 | resources:
122 | reservations:
123 | cpus: "0.25"
124 | memory: "48M"
125 | cache:
126 | image: quay.io/wikipedialibrary/memcached:latest
127 | ports:
128 | - "11211:11211"
129 | entrypoint:
130 | - memcached
131 | depends_on:
132 | - externallinks
133 | deploy:
134 | resources:
135 | reservations:
136 | cpus: "0.25"
137 | memory: "64M"
138 |
--------------------------------------------------------------------------------
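A local bring-up sketch, mirroring the test job in .github/workflows/dockerpublish.yml; it assumes template.env holds working defaults for the variables referenced above (EXTERNALLINKS_TAG, EVENTSTREAM_TAG, HOST_BACKUP_DIR, MYSQL_* values):

    cp template.env .env
    docker compose up -d --build
    docker compose exec -T externallinks /app/bin/django_wait_for_db.sh python django_wait_for_migrations.py test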
/docs/pull_request_template.md:
--------------------------------------------------------------------------------
1 | [//]: # (Thank you for uploading a PR to Wikilinks (externallinks)!)
2 |
3 | ## Description
4 | Describe your changes in detail following the [commit message guidelines](https://www.mediawiki.org/wiki/Gerrit/Commit_message_guidelines)
5 |
6 | ## Rationale
7 | [//]: # (Why is this change required? What problem does it solve?)
8 |
9 | ## Phabricator Ticket
10 | [//]: # (Link to the Phabricator ticket)
11 |
12 | ## How Has This Been Tested?
13 | [//]: # (- Did you add tests to your changes? Did you modify tests to accommodate your changes?)
14 | [//]: # (- Can this change be tested manually? How?)
15 |
16 | ## Screenshots of your changes (if appropriate):
17 | [//]: # (It can also be a GIF to prove that your changes are working)
18 |
19 | ## Types of changes
20 | What types of changes does your code introduce? Add an `x` in all the boxes that apply:
21 | - [ ] Bug fix (non-breaking change which fixes an issue)
22 | - [ ] New feature (non-breaking change which adds functionality)
23 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
24 |
--------------------------------------------------------------------------------
/extlinks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/__init__.py
--------------------------------------------------------------------------------
/extlinks/aggregates/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/aggregates/__init__.py
--------------------------------------------------------------------------------
/extlinks/aggregates/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | from .models import LinkAggregate, UserAggregate, PageProjectAggregate
4 |
5 |
6 | class LinkAggregateAdmin(admin.ModelAdmin):
7 | list_display = (
8 | "organisation",
9 | "collection",
10 | "full_date",
11 | "total_links_added",
12 | "total_links_removed",
13 | "on_user_list",
14 | )
15 | list_filter = ("organisation", "collection", "month", "year", "on_user_list")
16 | list_select_related = ["organisation", "collection"]
17 |
18 |
19 | admin.site.register(LinkAggregate, LinkAggregateAdmin)
20 |
21 |
22 | class UserAggregateAdmin(admin.ModelAdmin):
23 | list_display = (
24 | "organisation",
25 | "collection",
26 | "username",
27 | "full_date",
28 | "total_links_added",
29 | "total_links_removed",
30 | "on_user_list",
31 | )
32 | list_filter = ("organisation", "collection", "month", "year", "on_user_list")
33 | list_select_related = ["organisation", "collection"]
34 |
35 | admin.site.register(UserAggregate, UserAggregateAdmin)
36 |
37 |
38 | class PageProjectAggregateAdmin(admin.ModelAdmin):
39 | list_display = (
40 | "organisation",
41 | "collection",
42 | "project_name",
43 | "page_name",
44 | "full_date",
45 | "total_links_added",
46 | "total_links_removed",
47 | "on_user_list",
48 | )
49 | list_filter = ("organisation", "collection", "month", "year", "on_user_list")
50 | list_select_related = ["organisation", "collection"]
51 |
52 | admin.site.register(PageProjectAggregate, PageProjectAggregateAdmin)
53 |
--------------------------------------------------------------------------------
/extlinks/aggregates/factories.py:
--------------------------------------------------------------------------------
1 | import factory
2 | import random
3 | import datetime
4 |
5 | from .models import LinkAggregate, UserAggregate, PageProjectAggregate
6 | from extlinks.organisations.factories import CollectionFactory, OrganisationFactory
7 |
8 |
9 | class LinkAggregateFactory(factory.django.DjangoModelFactory):
10 | class Meta:
11 | model = LinkAggregate
12 | strategy = factory.CREATE_STRATEGY
13 |
14 | organisation = factory.SubFactory(OrganisationFactory)
15 | collection = factory.SubFactory(CollectionFactory)
16 | full_date = factory.Faker(
17 | "date_between_dates",
18 | date_start=datetime.date(2017, 1, 1),
19 | date_end=datetime.date(2020, 10, 31),
20 | )
21 | total_links_added = random.randint(0, 100)
22 | total_links_removed = random.randint(0, 80)
23 |
24 |
25 | class UserAggregateFactory(factory.django.DjangoModelFactory):
26 | class Meta:
27 | model = UserAggregate
28 | strategy = factory.CREATE_STRATEGY
29 |
30 | organisation = factory.SubFactory(OrganisationFactory)
31 | collection = factory.SubFactory(CollectionFactory)
32 | username = factory.Sequence(lambda n: 'user%d' % n)
33 | full_date = factory.Faker(
34 | "date_between_dates",
35 | date_start=datetime.date(2017, 1, 1),
36 | date_end=datetime.date(2020, 10, 31),
37 | )
38 | total_links_added = random.randint(0, 100)
39 | total_links_removed = random.randint(0, 80)
40 |
41 |
42 | class PageProjectAggregateFactory(factory.django.DjangoModelFactory):
43 | class Meta:
44 | model = PageProjectAggregate
45 | strategy = factory.CREATE_STRATEGY
46 |
47 | organisation = factory.SubFactory(OrganisationFactory)
48 | collection = factory.SubFactory(CollectionFactory)
49 | project_name = factory.Faker("word")
50 | page_name = factory.Faker("word")
51 | full_date = factory.Faker(
52 | "date_between_dates",
53 | date_start=datetime.date(2017, 1, 1),
54 | date_end=datetime.date(2020, 10, 31),
55 | )
56 | total_links_added = random.randint(0, 100)
57 | total_links_removed = random.randint(0, 80)
58 |
--------------------------------------------------------------------------------
/extlinks/aggregates/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/aggregates/management/__init__.py
--------------------------------------------------------------------------------
/extlinks/aggregates/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/aggregates/management/commands/__init__.py
--------------------------------------------------------------------------------
/extlinks/aggregates/management/commands/archive_link_aggregates.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 | from django.db import models
3 | from extlinks.aggregates.management.helpers import AggregateArchiveCommand
4 | from extlinks.aggregates.models import LinkAggregate
5 |
6 |
7 | class Command(AggregateArchiveCommand):
8 | """
9 | This command archives data from the 'aggregates_linkaggregate' table.
10 | """
11 |
12 | help = "Dump & delete or load data from the LinkAggregate table"
13 | name = "LinkAggregate"
14 |
15 | def get_model(self) -> Type[models.Model]:
16 | return LinkAggregate
17 |
--------------------------------------------------------------------------------
/extlinks/aggregates/management/commands/archive_pageproject_aggregates.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 | from django.db import models
3 | from extlinks.aggregates.management.helpers import AggregateArchiveCommand
4 | from extlinks.aggregates.models import PageProjectAggregate
5 |
6 |
7 | class Command(AggregateArchiveCommand):
8 | """
9 | This command archives data from the 'aggregates_pageprojectaggregate' table.
10 | """
11 |
12 | help = "Dump & delete or load data from the PageProjectAggregate table"
13 | name = "PageProjectAggregate"
14 |
15 | def get_model(self) -> Type[models.Model]:
16 | return PageProjectAggregate
17 |
--------------------------------------------------------------------------------
/extlinks/aggregates/management/commands/archive_user_aggregates.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 | from django.db import models
3 | from extlinks.aggregates.management.helpers import AggregateArchiveCommand
4 | from extlinks.aggregates.models import UserAggregate
5 |
6 |
7 | class Command(AggregateArchiveCommand):
8 | """
9 | This command archives data from the 'aggregates_useraggregate' table.
10 | """
11 |
12 | help = "Dump & delete or load data from the UserAggregate table"
13 | name = "UserAggregate"
14 |
15 | def get_model(self) -> Type[models.Model]:
16 | return UserAggregate
17 |
--------------------------------------------------------------------------------
/extlinks/aggregates/management/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | import os
4 |
5 | from typing import Union
6 |
7 | from django.core.serializers import serialize
8 |
9 | from extlinks.aggregates.models import (
10 | LinkAggregate,
11 | PageProjectAggregate,
12 | UserAggregate,
13 | )
14 | from extlinks.aggregates.management.helpers.aggregate_archive_command import (
15 | AggregateArchiveCommand,
16 | )
17 |
18 |
19 | def decode_archive(filename: str):
20 | """
21 | Loads and decompresses the given archive file.
22 | """
23 |
24 | with gzip.open(filename, "rt", encoding="utf-8") as archive:
25 | return json.loads(archive.read())
26 |
27 |
28 | def validate_aggregate_archive(
29 | aggregate_type: str,
30 | aggregate: Union[LinkAggregate, UserAggregate, PageProjectAggregate],
31 | output_dir: str,
32 | ) -> bool:
33 | """
34 | Validates that the given aggregate has a matching archive file.
35 | """
36 |
37 | on_user_list = "1" if aggregate.on_user_list else "0"
38 | filename = f"aggregates_{aggregate_type}_{aggregate.organisation.id}_{aggregate.collection.id}_{aggregate.full_date}_{on_user_list}.json.gz"
39 | archive_path = os.path.join(output_dir, filename)
40 |
41 | if not os.path.isfile(archive_path):
42 | return False
43 |
44 | archive_json = decode_archive(archive_path)
45 | link_aggregate_json = json.loads(serialize("json", [aggregate]))
46 |
47 | return link_aggregate_json == archive_json
48 |
49 |
50 | def validate_link_aggregate_archive(
51 | link_aggregate: LinkAggregate, output_dir: str
52 | ) -> bool:
53 | """
54 | Validates that the given LinkAggregate has a matching archive file.
55 | """
56 |
57 | return validate_aggregate_archive("linkaggregate", link_aggregate, output_dir)
58 |
59 |
60 | def validate_user_aggregate_archive(
61 | user_aggregate: UserAggregate, output_dir: str
62 | ) -> bool:
63 | """
64 | Validates that the given UserAggregate has a matching archive file.
65 | """
66 |
67 | return validate_aggregate_archive("useraggregate", user_aggregate, output_dir)
68 |
69 |
70 | def validate_pageproject_aggregate_archive(
71 | pageproject_aggregate: PageProjectAggregate, output_dir: str
72 | ) -> bool:
73 | """
74 | Validates that the given PageProjectAggregate has a matching archive file.
75 | """
76 |
77 | return validate_aggregate_archive(
78 | "pageprojectaggregate", pageproject_aggregate, output_dir
79 | )
80 |
81 |
82 | __all__ = ["AggregateArchiveCommand"]
83 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2020-11-10 00:50
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 | dependencies = [
9 | ("organisations", "0006_auto_20190730_1355"),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name="LinkAggregate",
15 | fields=[
16 | (
17 | "id",
18 | models.AutoField(
19 | auto_created=True,
20 | primary_key=True,
21 | serialize=False,
22 | verbose_name="ID",
23 | ),
24 | ),
25 | ("day", models.PositiveIntegerField()),
26 | ("month", models.PositiveIntegerField()),
27 | ("year", models.PositiveIntegerField()),
28 | ("full_date", models.DateField()),
29 | ("total_links_added", models.PositiveIntegerField()),
30 | ("total_links_removed", models.PositiveIntegerField()),
31 | ("created_at", models.DateTimeField(auto_now_add=True)),
32 | ("updated_at", models.DateTimeField(auto_now=True)),
33 | (
34 | "collection",
35 | models.ForeignKey(
36 | on_delete=django.db.models.deletion.CASCADE,
37 | to="organisations.collection",
38 | ),
39 | ),
40 | (
41 | "organisation",
42 | models.ForeignKey(
43 | default=None,
44 | on_delete=django.db.models.deletion.CASCADE,
45 | to="organisations.organisation",
46 | ),
47 | ),
48 | ],
49 | ),
50 | ]
51 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0002_useraggregate.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2020-11-16 18:58
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ("organisations", "0006_auto_20190730_1355"),
11 | ("aggregates", "0001_initial"),
12 | ]
13 |
14 | operations = [
15 | migrations.CreateModel(
16 | name="UserAggregate",
17 | fields=[
18 | (
19 | "id",
20 | models.AutoField(
21 | auto_created=True,
22 | primary_key=True,
23 | serialize=False,
24 | verbose_name="ID",
25 | ),
26 | ),
27 | ("username", models.CharField(max_length=235)),
28 | ("day", models.PositiveIntegerField()),
29 | ("month", models.PositiveIntegerField()),
30 | ("year", models.PositiveIntegerField()),
31 | ("full_date", models.DateField()),
32 | ("total_links_added", models.PositiveIntegerField()),
33 | ("total_links_removed", models.PositiveIntegerField()),
34 | ("created_at", models.DateTimeField(auto_now_add=True)),
35 | ("updated_at", models.DateTimeField(auto_now=True)),
36 | (
37 | "collection",
38 | models.ForeignKey(
39 | on_delete=django.db.models.deletion.CASCADE,
40 | to="organisations.collection",
41 | ),
42 | ),
43 | (
44 | "organisation",
45 | models.ForeignKey(
46 | on_delete=django.db.models.deletion.CASCADE,
47 | to="organisations.organisation",
48 | ),
49 | ),
50 | ],
51 | ),
52 | ]
53 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0003_add_indexes.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2020-11-25 20:12
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("aggregates", "0002_useraggregate"),
10 | ]
11 |
12 | operations = [
13 | migrations.AddIndex(
14 | model_name="linkaggregate",
15 | index=models.Index(
16 | fields=["full_date"], name="aggregates__full_da_865352_idx"
17 | ),
18 | ),
19 | migrations.AddIndex(
20 | model_name="linkaggregate",
21 | index=models.Index(
22 | fields=["collection"], name="aggregates__collect_1c3986_idx"
23 | ),
24 | ),
25 | migrations.AddIndex(
26 | model_name="useraggregate",
27 | index=models.Index(
28 | fields=["full_date"], name="aggregates__full_da_3a3ae3_idx"
29 | ),
30 | ),
31 | migrations.AddIndex(
32 | model_name="useraggregate",
33 | index=models.Index(
34 | fields=["collection"], name="aggregates__collect_53b6a2_idx"
35 | ),
36 | ),
37 | ]
38 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0004_pageprojectaggregate.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2020-11-26 00:58
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ("organisations", "0006_auto_20190730_1355"),
11 | ("aggregates", "0003_add_indexes"),
12 | ]
13 |
14 | operations = [
15 | migrations.CreateModel(
16 | name="PageProjectAggregate",
17 | fields=[
18 | (
19 | "id",
20 | models.AutoField(
21 | auto_created=True,
22 | primary_key=True,
23 | serialize=False,
24 | verbose_name="ID",
25 | ),
26 | ),
27 | ("project_name", models.CharField(max_length=32)),
28 | ("page_name", models.CharField(max_length=255)),
29 | ("day", models.PositiveIntegerField()),
30 | ("month", models.PositiveIntegerField()),
31 | ("year", models.PositiveIntegerField()),
32 | ("full_date", models.DateField()),
33 | ("total_links_added", models.PositiveIntegerField()),
34 | ("total_links_removed", models.PositiveIntegerField()),
35 | ("created_at", models.DateTimeField(auto_now_add=True)),
36 | ("updated_at", models.DateTimeField(auto_now=True)),
37 | (
38 | "collection",
39 | models.ForeignKey(
40 | on_delete=django.db.models.deletion.CASCADE,
41 | to="organisations.collection",
42 | ),
43 | ),
44 | (
45 | "organisation",
46 | models.ForeignKey(
47 | on_delete=django.db.models.deletion.CASCADE,
48 | to="organisations.organisation",
49 | ),
50 | ),
51 | ],
52 | ),
53 | migrations.AddIndex(
54 | model_name="pageprojectaggregate",
55 | index=models.Index(
56 | fields=["full_date"], name="aggregates__full_da_6cc1a0_idx"
57 | ),
58 | ),
59 | migrations.AddIndex(
60 | model_name="pageprojectaggregate",
61 | index=models.Index(
62 | fields=["collection"], name="aggregates__collect_35e404_idx"
63 | ),
64 | ),
65 | ]
66 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0005_add_organisation_index.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2020-12-03 22:55
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('organisations', '0006_auto_20190730_1355'),
11 | ('aggregates', '0004_pageprojectaggregate'),
12 | ]
13 |
14 | operations = [
15 | migrations.AlterField(
16 | model_name='linkaggregate',
17 | name='organisation',
18 | field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='organisations.organisation'),
19 | ),
20 | migrations.AddIndex(
21 | model_name='linkaggregate',
22 | index=models.Index(fields=['organisation'], name='aggregates__organis_c142ff_idx'),
23 | ),
24 | migrations.AddIndex(
25 | model_name='pageprojectaggregate',
26 | index=models.Index(fields=['organisation'], name='aggregates__organis_dc6018_idx'),
27 | ),
28 | migrations.AddIndex(
29 | model_name='useraggregate',
30 | index=models.Index(fields=['organisation'], name='aggregates__organis_3955fd_idx'),
31 | ),
32 | ]
33 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0006_delete_aggregate_tables_info.py:
--------------------------------------------------------------------------------
1 | from django.db import migrations
2 |
3 |
4 | def delete_information_aggregate_tables(apps, schema_editor):
5 | LinkAggregate = apps.get_model("aggregates", "LinkAggregate")
6 | PageProjectAggregate = apps.get_model("aggregates", "PageProjectAggregate")
7 | UserAggregate = apps.get_model("aggregates", "UserAggregate")
8 |
9 | LinkAggregate.objects.all().delete()
10 | PageProjectAggregate.objects.all().delete()
11 | UserAggregate.objects.all().delete()
12 |
13 |
14 | class Migration(migrations.Migration):
15 |
16 | dependencies = [("aggregates", "0005_add_organisation_index")]
17 |
18 | operations = [migrations.RunPython(delete_information_aggregate_tables)]
19 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0007_add_user_list_flags.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.4 on 2020-12-23 23:59
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("aggregates", "0006_delete_aggregate_tables_info"),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name="linkaggregate",
15 | name="on_user_list",
16 | field=models.BooleanField(default=False),
17 | ),
18 | migrations.AddField(
19 | model_name="pageprojectaggregate",
20 | name="on_user_list",
21 | field=models.BooleanField(default=False),
22 | ),
23 | migrations.AddField(
24 | model_name="useraggregate",
25 | name="on_user_list",
26 | field=models.BooleanField(default=False),
27 | ),
28 | ]
29 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0008_alter_linkaggregate_id_alter_pageprojectaggregate_id_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.7 on 2023-11-07 19:28
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('aggregates', '0007_add_user_list_flags'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='linkaggregate',
15 | name='id',
16 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
17 | ),
18 | migrations.AlterField(
19 | model_name='pageprojectaggregate',
20 | name='id',
21 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
22 | ),
23 | migrations.AlterField(
24 | model_name='useraggregate',
25 | name='id',
26 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
27 | ),
28 | ]
29 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0009_pageprojectaggregate_composite_index.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.19 on 2025-02-19 20:52
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('aggregates', '0008_alter_linkaggregate_id_alter_pageprojectaggregate_id_and_more'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddIndex(
14 | model_name='pageprojectaggregate',
15 | index=models.Index(fields=['full_date', 'collection_id', 'project_name', 'page_name'], name='aggregates__full_da_53fee7_idx'),
16 | ),
17 | ]
18 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0010_add_aggregate_indexes.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.19 on 2025-02-27 17:24
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('aggregates', '0009_pageprojectaggregate_composite_index'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddIndex(
14 | model_name='linkaggregate',
15 | index=models.Index(fields=['organisation_id', 'collection_id', 'on_user_list', 'year', 'month'], name='aggregates__organis_286b0f_idx'),
16 | ),
17 | migrations.AddIndex(
18 | model_name='pageprojectaggregate',
19 | index=models.Index(fields=['organisation_id', 'collection_id', 'project_name', 'page_name', 'on_user_list', 'year', 'month'], name='aggregates__organis_c106e7_idx'),
20 | ),
21 | migrations.AddIndex(
22 | model_name='useraggregate',
23 | index=models.Index(fields=['organisation_id', 'collection_id', 'username', 'on_user_list', 'year', 'month'], name='aggregates__organis_318980_idx'),
24 | ),
25 | ]
26 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/0011_aggregate_composite_indexes.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.19 on 2025-03-07 19:13
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("aggregates", "0010_add_aggregate_indexes"),
10 | ]
11 |
12 | operations = [
13 | migrations.AddIndex(
14 | model_name="pageprojectaggregate",
15 | index=models.Index(
16 | fields=["organisation", "project_name"],
17 | name="aggregates__organis_572036_idx",
18 | ),
19 | ),
20 | migrations.AddIndex(
21 | model_name="useraggregate",
22 | index=models.Index(
23 | fields=["organisation", "username"],
24 | name="aggregates__organis_05ef9a_idx",
25 | ),
26 | ),
27 | migrations.AddIndex(
28 | model_name="pageprojectaggregate",
29 | index=models.Index(
30 | fields=["collection", "project_name", "page_name"],
31 | name="aggregates__collect_e1e227_idx",
32 | ),
33 | ),
34 | migrations.AddIndex(
35 | model_name="useraggregate",
36 | index=models.Index(
37 | fields=["collection", "username"], name="aggregates__collect_463085_idx"
38 | ),
39 | ),
40 | ]
41 |
--------------------------------------------------------------------------------
/extlinks/aggregates/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/aggregates/migrations/__init__.py
--------------------------------------------------------------------------------
/extlinks/aggregates/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/extlinks/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/common/__init__.py
--------------------------------------------------------------------------------
/extlinks/common/forms.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 | from dateutil.relativedelta import relativedelta
3 |
4 | from django import forms
5 |
6 |
7 | class DateInput(forms.DateInput):
8 | # Creating a custom widget because default DateInput doesn't use
9 | # input type="date"
10 | input_type = "date"
11 |
12 |
13 | class FilterForm(forms.Form):
14 |
15 | start_date = forms.DateField(
16 | required=False,
17 | label="Start date:",
18 | widget=DateInput(attrs={"class": "form-control"}),
19 | )
20 | end_date = forms.DateField(
21 | required=False,
22 | label="End date:",
23 | widget=DateInput(attrs={"class": "form-control"}),
24 | )
25 |
26 | limit_to_user_list = forms.BooleanField(required=False)
27 |
28 | namespace_id = forms.IntegerField(
29 | required=False,
30 | label="Namespace ID:",
31 | widget=forms.NumberInput(
32 | attrs={"class": "form-control", "style": "width: 6rem;"}
33 | ),
34 | )
35 |
36 | exclude_bots = forms.BooleanField(required=False)
37 |
38 | def clean_start_date(self):
39 | """
40 | This is automatically called by Django when validating this field.
41 | Modify the start date to return the first day of its month.
42 | """
43 | start_date = self.cleaned_data.get("start_date")
44 |
45 | if not start_date:
46 | return None
47 |
48 | return start_date.replace(day=1)
49 |
50 |
51 | def clean_end_date(self):
52 | """
53 | This is automatically called by Django when validating this field.
54 | Modify the end date to return the last day of its month.
55 | """
56 | end_date = self.cleaned_data.get("end_date")
57 |
58 | if not end_date:
59 | return None
60 |
61 | next_month = end_date.replace(day=1) + relativedelta(months=1)
62 |
63 | return next_month - timedelta(days=1)
64 |
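
A minimal sketch (not repository code) of the month-snapping behaviour above: a valid start date is pulled back to the first day of its month and a valid end date is pushed forward to the last day of its month.

    from datetime import date

    from extlinks.common.forms import FilterForm

    form = FilterForm(data={"start_date": "2024-05-10", "end_date": "2024-05-10"})
    assert form.is_valid()
    assert form.cleaned_data["start_date"] == date(2024, 5, 1)  # first day of May
    assert form.cleaned_data["end_date"] == date(2024, 5, 31)   # last day of May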
--------------------------------------------------------------------------------
/extlinks/common/helpers.py:
--------------------------------------------------------------------------------
1 | from datetime import date, timedelta
2 | from itertools import islice
3 |
4 | from django.db.models import Avg, Q
5 | from django.db.models.functions import TruncMonth
6 |
7 | from logging import getLogger
8 |
9 | logger = getLogger("django")
10 |
11 |
12 | def get_month_average(average_data, check_date):
13 | for avg_data in average_data:
14 | if avg_data["month"] == check_date:
15 | return avg_data["average"]
16 |
17 | return 0
18 |
19 |
20 | def get_linksearchtotal_data_by_time(queryset):
21 | """
22 | Calculates per-unit-time data from a queryset of LinkSearchTotal objects
23 |
24 | Given a queryset of LinkSearchTotal objects, returns the totals
25 | per month.
26 |
27 | Returns two lists: dates and totals
28 | """
29 |
30 | if queryset:
31 | earliest_date = queryset.earliest("date").date
32 | current_date = date.today()
33 |
34 | linksearch_data = []
35 | dates = []
36 |
37 | average_month_data = (
38 | queryset.annotate(month=TruncMonth("date"))
39 | .values("month")
40 | .annotate(average=Avg("total"))
41 | )
42 |
43 | while current_date >= earliest_date:
44 | month_first = current_date.replace(day=1)
45 | this_month_avg = get_month_average(average_month_data, month_first)
46 |
47 | linksearch_data.append(round(this_month_avg))
48 | dates.append(month_first.strftime("%Y-%m-%d"))
49 |
50 | # Figure out what the last month is regardless of today's date
51 | current_date = month_first - timedelta(days=1)
52 |
53 | # If a month has no data for some reason, we should use whatever
54 | # figure we have for the previous month, unless it is the current month
55 | for i, data in enumerate(linksearch_data):
56 | if data == 0 and i != len(linksearch_data) - 1:
57 | linksearch_data[i] = linksearch_data[i + 1]
58 |
59 | return dates[::-1], linksearch_data[::-1]
60 | else:
61 | return [], []
62 |
63 |
64 | def filter_linksearchtotals(queryset, filter_dict):
65 | """
66 | Adds filter conditions to a LinkSearchTotal queryset based on form results.
67 |
68 | queryset -- a LinkSearchTotal queryset
69 | filter_dict -- a dictionary of data from the user filter form
70 |
71 | Returns a queryset
72 | """
73 | if "start_date" in filter_dict:
74 | start_date = filter_dict["start_date"]
75 | if start_date:
76 | queryset = queryset.filter(date__gte=start_date)
77 |
78 | if "end_date" in filter_dict:
79 | end_date = filter_dict["end_date"]
80 | if end_date:
81 | queryset = queryset.filter(date__lte=end_date)
82 |
83 | return queryset
84 |
85 |
86 | def build_queryset_filters(form_data, collection_or_organisations):
87 | """
88 |     This function parses a filter dictionary and creates a Q object with
89 |     which to filter the aggregates tables
90 |
91 | Parameters
92 | ----------
93 | form_data: dict
94 |         Cleaned data from the filter form, used to filter the aggregates
95 |         tables by date range and/or by whether the user is on an
96 |         organisation's user list
97 |
98 | collection_or_organisations : dict
99 | A dictionary that will have either a collection or a set of
100 | organisations to filter by.
101 |
102 | Returns
103 | -------
104 | Q : A Q object which will filter the aggregates queries
105 | """
106 | start_date = None
107 | end_date = None
108 | start_date_filter = Q()
109 | end_date_filter = Q()
110 | limit_to_user_list_filter = Q()
111 | # The aggregates queries will always be filtered by organisation
112 | if "organisations" in collection_or_organisations:
113 | collection_or_organisation_filter = Q(
114 | organisation__in=collection_or_organisations["organisations"]
115 | )
116 | elif "linkevents" in collection_or_organisations:
117 | collection_or_organisation_filter = Q()
118 | else:
119 | collection_or_organisation_filter = Q(
120 | collection=collection_or_organisations["collection"]
121 | )
122 |
123 | if "start_date" in form_data:
124 | start_date = form_data["start_date"]
125 | if start_date:
126 | if "linkevents" in collection_or_organisations:
127 | start_date_filter = Q(timestamp__gte=start_date)
128 | else:
129 | start_date_filter = Q(full_date__gte=start_date)
130 | if "end_date" in form_data:
131 | end_date = form_data["end_date"]
132 |             # Only apply the end date filter when an end date was provided
133 | if end_date:
134 | if "linkevents" in collection_or_organisations:
135 | end_date_filter = Q(timestamp__lte=end_date)
136 | else:
137 | end_date_filter = Q(full_date__lte=end_date)
138 |
139 | if "limit_to_user_list" in form_data:
140 | limit_to_user_list = form_data["limit_to_user_list"]
141 | if limit_to_user_list:
142 | limit_to_user_list_filter = Q(on_user_list=True)
143 |
144 | if start_date and end_date:
145 |         # If the start date is greater than the end date, it won't filter
146 | # by date
147 | if start_date >= end_date:
148 | return collection_or_organisation_filter & limit_to_user_list_filter
149 |
150 | return (
151 | collection_or_organisation_filter
152 | & limit_to_user_list_filter
153 | & start_date_filter
154 | & end_date_filter
155 | )
156 |
157 |
158 | def batch_iterator(iterable, size=1000):
159 | """
160 | This yields successive batches from an iterable (memory-efficient).
161 |
162 | Used for large queries that use `.iterator()` for efficiency.
163 | Instead of loading all data into memory at once, this function
164 | retrieves items lazily in fixed-size batches.
165 |
166 | Parameters
167 | ----------
168 | iterable : Iterator
169 | An iterable object, typically a Django QuerySet with `.iterator()`,
170 | that returns items one by one in a memory-efficient manner.
171 |
172 | size : int
173 | The maximum number of items to include in each batch.
174 |
175 | Returns
176 | -------
177 | Iterator[List]
178 | An iterator that yields lists containing at most `size` items
179 | per batch.
180 | """
181 | iterator = iter(iterable)
182 | while batch := list(islice(iterator, size)):
183 | yield batch
184 |
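
A minimal sketch (not repository code) of how these helpers are typically combined: build_queryset_filters turns a form's cleaned data into a Q object for the aggregates tables, and batch_iterator chunks a large `.iterator()` queryset. The `collection` variable and `process` callable below are hypothetical.

    from datetime import date

    from extlinks.aggregates.models import LinkAggregate
    from extlinks.common.helpers import batch_iterator, build_queryset_filters

    # `collection` is assumed to be an existing Collection instance.
    queryset_filter = build_queryset_filters(
        {
            "start_date": date(2024, 1, 1),
            "end_date": date(2024, 6, 30),
            "limit_to_user_list": True,
        },
        {"collection": collection},
    )

    aggregates = LinkAggregate.objects.filter(queryset_filter).iterator()
    for batch in batch_iterator(aggregates, size=500):
        process(batch)  # placeholder for whatever bulk work each batch needs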
--------------------------------------------------------------------------------
/extlinks/common/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/common/management/__init__.py
--------------------------------------------------------------------------------
/extlinks/common/management/commands/__init__.py:
--------------------------------------------------------------------------------
1 | from django.core.management.base import BaseCommand as DjangoBaseCommand
2 | from filelock import FileLock
3 | import inspect
4 | from os import remove
5 | from os.path import basename
6 |
7 |
8 | class BaseCommand(DjangoBaseCommand):
9 | """
10 | Django BaseCommand wrapper that adds file locks to management commands
11 | """
12 |
13 | def handle(self, *args, **options):
14 | lockname = basename(inspect.getfile(self.__class__))
15 | # Use a lockfile to prevent overruns.
16 | lockfile = "/tmp/{}.lock".format(lockname)
17 | lock = FileLock(lockfile)
18 | lock.acquire()
19 | try:
20 | self._handle(*args, **options)
21 | finally:
22 | lock.release()
23 | remove(lockfile)
24 |
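
A minimal sketch (not repository code) of how a management command uses this wrapper: subclasses implement _handle() instead of handle(), and the file lock prevents overlapping runs of the same command (for example from cron).

    from extlinks.common.management.commands import BaseCommand


    class Command(BaseCommand):
        help = "Example command protected by the shared file lock"

        def _handle(self, *args, **options):
            # Runs only while /tmp/<module>.lock is held by this process.
            self.stdout.write("Doing the actual work here.")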
--------------------------------------------------------------------------------
/extlinks/common/management/commands/import_twl_data.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from . import BaseCommand
4 | from extlinks.links.models import URLPattern
5 | from extlinks.organisations.models import Organisation, Collection
6 | from extlinks.programs.models import Program
7 |
8 |
9 | class Command(BaseCommand):
10 | help = """
11 | Imports Programs, Orgs, Collections, and URLPatterns from The Wikipedia
12 | Library's old metrics collection system"""
13 |
14 | def add_arguments(self, parser):
15 | parser.add_argument("file_path", nargs="+", type=str)
16 |
17 | def _handle(self, *args, **options):
18 | file_path = options["file_path"][0]
19 |
20 | # Check TWL program exists, if it doesn't, create it.
21 | try:
22 | twl_program = Program.objects.get(name="The Wikipedia Library")
23 | except Program.DoesNotExist:
24 | twl_program = Program(name="The Wikipedia Library")
25 | twl_program.save()
26 |
27 | with open(file_path, "r") as input_file:
28 | csv_reader = csv.reader(input_file)
29 | next(csv_reader)
30 | for row in csv_reader:
31 | organisation = row[0]
32 | collection = row[1]
33 | urlpattern = row[2]
34 | twl_link = row[3]
35 | print(row)
36 |
37 | # Create Organisation
38 | try:
39 | organisation_object = Organisation.objects.get(name=organisation)
40 | except Organisation.DoesNotExist:
41 | organisation_object = Organisation(name=organisation)
42 | organisation_object.save()
43 | if twl_link == "x":
44 | organisation_object.program.add(twl_program)
45 |
46 | # Create Collection
47 | try:
48 | collection_object = Collection.objects.get(
49 | organisation=organisation_object,
50 | name=collection,
51 | )
52 | except Collection.DoesNotExist:
53 | collection_object = Collection(
54 | name=collection, organisation=organisation_object
55 | )
56 | collection_object.save()
57 |
58 | # Create URLPattern
59 | # We shouldn't have any duplicates here but let's be safe.
60 | try:
61 | urlpattern_object = URLPattern.objects.get(url=urlpattern)
62 | except URLPattern.DoesNotExist:
63 | urlpattern_object = URLPattern(
64 | url=urlpattern, collection=collection_object
65 | )
66 | urlpattern_object.save()
67 |
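
A hedged usage sketch (not repository code): the command takes a single CSV path whose rows are expected to contain, in order, the organisation name, collection name, URL pattern, and an "x" marker when the organisation belongs to The Wikipedia Library program; the path below is hypothetical.

    from django.core.management import call_command

    call_command("import_twl_data", "/tmp/twl_partners.csv")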
--------------------------------------------------------------------------------
/extlinks/common/swift.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 | import logging
3 | import os
4 |
5 | from typing import Iterable, List, Tuple, cast
6 |
7 | import swiftclient
8 | import keystoneauth1.identity.v3 as identity
9 | import keystoneauth1.session as session
10 |
11 | logger = logging.getLogger("django")
12 |
13 |
14 | def swift_connection() -> swiftclient.Connection:
15 | """
16 | Creates a swiftclient Connection configured using environment variables.
17 |
18 | This method works with v3 application credentials authentication only.
19 |
20 | Returns
21 | -------
22 | swiftclient.Connection
23 | A connection to the Swift object storage.
24 | """
25 |
26 | auth_url = os.environ.get("OPENSTACK_AUTH_URL")
27 | credential_id = os.environ.get("SWIFT_APPLICATION_CREDENTIAL_ID")
28 | credential_secret = os.environ.get("SWIFT_APPLICATION_CREDENTIAL_SECRET")
29 |
30 | if not auth_url or not credential_id or not credential_secret:
31 | raise RuntimeError(
32 | "The 'OPENSTACK_AUTH_URL', 'SWIFT_APPLICATION_CREDENTIAL_ID' and "
33 | "'SWIFT_APPLICATION_CREDENTIAL_SECRET' environment variables must "
34 | "be defined to use the Swift client"
35 | )
36 |
37 | return swiftclient.Connection(
38 | session=session.Session(
39 | auth=identity.ApplicationCredential(
40 | auth_url=auth_url,
41 | application_credential_id=credential_id,
42 | application_credential_secret=credential_secret,
43 | user_domain_id="default",
44 | )
45 | )
46 | )
47 |
48 |
49 | def get_containers(conn: swiftclient.Connection) -> List[dict]:
50 | """
51 | Retrieves a list of containers from object storage.
52 |
53 | Parameters
54 | ----------
55 | conn : swiftclient.Connection
56 | A connection to the Swift object storage.
57 |
58 | Returns
59 | -------
60 | List[dict]
61 | A list of dictionaries containing information about each container.
62 | """
63 |
64 | response = conn.get_account()
65 | if not response or len(response) < 2:
66 | raise RuntimeError("Failed to retrieve container list from Swift account.")
67 |
68 | return response[1]
69 |
70 |
71 | def ensure_container_exists(conn: swiftclient.Connection, container: str) -> bool:
72 | """
73 | Creates a new container in object storage if it doesn't already exist.
74 |
75 | Parameters
76 | ----------
77 | conn : swiftclient.Connection
78 | A connection to the Swift object storage.
79 |
80 | container : str
81 | The name of the container to create.
82 |
83 | Returns
84 | -------
85 | bool
86 | True if the container was created, False if it already existed.
87 | """
88 |
89 | containers = (c["name"] for c in get_containers(conn))
90 | if container not in containers:
91 | conn.put_container(container)
92 | return True
93 |
94 | return False
95 |
96 |
97 | def upload_file(
98 | conn: swiftclient.Connection,
99 | container: str,
100 | path: str,
101 | content_type="application/octet-stream",
102 | ):
103 | """
104 | Uploads a file on the local filesystem to the provided Swift container.
105 |
106 | Parameters
107 | ----------
108 | conn : swiftclient.Connection
109 | A connection to the Swift object storage.
110 |
111 | container : str
112 | The name of the container to upload the file to.
113 |
114 | path : str
115 | The path to the file on the local filesystem.
116 |
117 | content_type : str
118 | The content type of the file.
119 |
120 | Returns
121 | -------
122 | str
123 | The name of the object in Swift.
124 | """
125 |
126 | object_name = os.path.basename(path)
127 |
128 | with open(path, "rb") as f:
129 | conn.put_object(
130 | container,
131 | object_name,
132 | contents=f,
133 | content_type=content_type,
134 | )
135 |
136 | return object_name
137 |
138 |
139 | def download_file(conn: swiftclient.Connection, container: str, object: str) -> bytes:
140 | """
141 | Downloads a file from object storage.
142 |
143 | Parameters
144 | ----------
145 | conn : swiftclient.Connection
146 | A connection to the Swift object storage.
147 |
148 | container : str
149 | The name of the container to download the file from.
150 |
151 | object : str
152 | The name of the object to download.
153 |
154 | Returns
155 | -------
156 | bytes
157 | The contents of the object.
158 | """
159 |
160 | _, response = conn.get_object(container, object)
161 |
162 | return cast(bytes, response)
163 |
164 |
165 | def file_exists(conn: swiftclient.Connection, container: str, object: str) -> bool:
166 | """
167 | Checks if a file exists in Swift.
168 |
169 | Parameters
170 | ----------
171 | conn : swiftclient.Connection
172 | A connection to the Swift object storage.
173 |
174 | container : str
175 | The name of the container to check.
176 |
177 | object : str
178 | The name of the object to check.
179 |
180 | Returns
181 | -------
182 | bool
183 | True if the file exists, False if it doesn't.
184 | """
185 |
186 | try:
187 | conn.head_object(container, object)
188 | return True
189 | except swiftclient.ClientException as e:
190 | if e.http_status == 404:
191 | return False
192 | else:
193 | raise e
194 |
195 |
196 | def batch_upload_files(
197 | conn: swiftclient.Connection,
198 | container: str,
199 | files: Iterable[str],
200 | max_workers=10,
201 | ) -> Tuple[List[str], List[str]]:
202 | """
203 | Uploads a batch of multiple files to the given Swift container.
204 |
205 | Parameters
206 | ----------
207 | conn : swiftclient.Connection
208 | A connection to the Swift object storage.
209 |
210 | container : str
211 | The name of the container to upload the files to.
212 |
213 | files : Iterable[str]
214 | An iterable of file paths to upload.
215 |
216 | max_workers : int
217 | The maximum number of concurrent uploads to perform.
218 |
219 | Returns
220 | -------
221 | Tuple[List[str], List[str]]
222 | A tuple containing two lists. The first list contains the names of the
223 | files that were successfully uploaded. The second list contains the
224 | names of the files that failed to upload.
225 | """
226 |
227 | successful = []
228 | failed = []
229 |
230 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
231 | futures = {executor.submit(upload_file, conn, container, f): f for f in files}
232 |
233 | for future in concurrent.futures.as_completed(futures):
234 | path = futures[future]
235 |
236 | try:
237 | object_name = future.result()
238 |                 logger.info("Successfully uploaded '%s' as '%s'", path, object_name)
239 |                 successful.append(path)
240 |             except Exception as exc:
241 |                 logger.error("Upload failed for '%s': %s", path, exc)
242 | failed.append(path)
243 |
244 | return successful, failed
245 |
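
A minimal sketch (not repository code) tying these helpers together, assuming the three OpenStack environment variables are set; the container name and file path below are hypothetical.

    from extlinks.common import swift

    conn = swift.swift_connection()  # credentials come from the environment
    swift.ensure_container_exists(conn, "backups")

    successful, failed = swift.batch_upload_files(
        conn,
        "backups",
        ["/backup/links_linkevent_20240101.json.gz"],
        max_workers=5,
    )
    if failed:
        raise RuntimeError("Failed to upload: {}".format(failed))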
--------------------------------------------------------------------------------
/extlinks/common/templates/common/statistics_table.html:
--------------------------------------------------------------------------------
1 |
Statistics
2 | {% if collections %}
3 |
4 |
7 |
8 | Total added:
9 |
10 |
11 |
12 | Total removed:
13 |
14 |
15 |
16 |
17 |
18 |
19 | {% if collection.linksearch_total_start %}
20 |
23 |
24 | {{ collection.linksearch_start_date }}:
25 | {{ collection.linksearch_total_start }}
26 |
27 |
28 | Current total:
29 | {{ collection.linksearch_total_current }}
30 |
31 |
32 |
33 | +
36 | {% else %}
37 | color: red;">
38 | {% endif %}
39 | {{ collection.linksearch_total_diff }}
40 |
41 |
42 | {% endif %}
43 |
46 |
47 | Total editors:
48 |
49 |
50 |
51 | Total projects:
52 |
53 |
54 |
55 | {% else %}
56 |
57 |
60 |
61 | Total added:
62 |
63 |
64 |
65 | Total removed:
66 |
67 |
68 |
69 |
70 |
71 |
72 |
75 |
76 | Total editors:
77 |
78 |
79 |
80 | Total projects:
81 |
82 |
83 |
84 | {% endif %}
85 |
--------------------------------------------------------------------------------
/extlinks/common/templates/common/top_organisations_table.html:
--------------------------------------------------------------------------------
1 | {% load common_filters %}
2 |
3 |
4 | Organisation
5 | Added Links
6 |
7 |
8 |
--------------------------------------------------------------------------------
/extlinks/common/templatetags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/common/templatetags/__init__.py
--------------------------------------------------------------------------------
/extlinks/common/templatetags/common_filters.py:
--------------------------------------------------------------------------------
1 | from django import template
2 | from django.template.defaultfilters import stringfilter
3 |
4 | register = template.Library()
5 |
6 |
7 | @register.filter
8 | @stringfilter
9 | def replace_underscores(string):
10 | return string.replace("_", " ")
11 |
--------------------------------------------------------------------------------
/extlinks/common/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tempfile
4 |
5 | from datetime import date, datetime, timezone
6 | from unittest import mock
7 |
8 | import swiftclient
9 | import time_machine
10 |
11 | from django.test import TestCase
12 |
13 | import extlinks.common.swift as swift
14 |
15 | from extlinks.common.forms import FilterForm
16 | from extlinks.common.helpers import get_linksearchtotal_data_by_time
17 | from extlinks.links.factories import LinkSearchTotalFactory, URLPatternFactory
18 | from extlinks.links.models import LinkSearchTotal
19 |
20 | SWIFT_TEST_CREDENTIALS = {
21 | "OPENSTACK_AUTH_URL": "fakeauthurl",
22 | "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredentialid",
23 | "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredentialsecret",
24 | }
25 |
26 |
27 | class LinkSearchDataByTimeTest(TestCase):
28 | def setUp(self):
29 | url = URLPatternFactory(url="www.acme.org")
30 | # Adding LinkSearch data
31 | LinkSearchTotalFactory(url=url, date=datetime(2020, 1, 15, tzinfo=timezone.utc))
32 | LinkSearchTotalFactory(url=url, date=datetime(2020, 2, 1, tzinfo=timezone.utc))
33 | LinkSearchTotalFactory(url=url, date=datetime(2020, 2, 2, tzinfo=timezone.utc))
34 | LinkSearchTotalFactory(url=url, date=datetime(2020, 2, 18, tzinfo=timezone.utc))
35 | LinkSearchTotalFactory(url=url, date=datetime(2020, 3, 6, tzinfo=timezone.utc))
36 | LinkSearchTotalFactory(
37 | url=url, date=datetime(2020, 4, 16, tzinfo=timezone.utc), total=0
38 | )
39 |
40 | def test_linksearch_data_empty_queryset(self):
41 | linksearch_queryset = None
42 |
43 | dates, linksearch_data = get_linksearchtotal_data_by_time(linksearch_queryset)
44 |
45 | self.assertEqual(0, len(dates))
46 | self.assertEqual(0, len(linksearch_data))
47 |
48 | def test_linksearch_data(self):
49 | with time_machine.travel(date(2020, 12, 31)):
50 | linksearch = LinkSearchTotal.objects.all()
51 |
52 | dates, linksearch_data = get_linksearchtotal_data_by_time(linksearch)
53 |
54 | self.assertEqual(12, len(dates))
55 | self.assertEqual(12, len(linksearch_data))
56 |
57 |
58 | class FilterFormTest(TestCase):
59 |
60 | def test_valid_data(self):
61 | form = FilterForm(
62 | data={
63 | "start_date": "2025-02-15",
64 | "end_date": "2025-06-21",
65 | "limit_to_user_list": "on",
66 | "namespace_id": "10",
67 | # "exclude_bots": "on", # omit this key to assertFalse
68 | }
69 | )
70 | self.assertTrue(form.is_valid())
71 | self.assertEqual(form.cleaned_data["start_date"], date(2025, 2, 1))
72 | self.assertEqual(
73 | form.cleaned_data["end_date"], date(2025, 6, 30)
74 | ) # Should return last day of month
75 | self.assertTrue(form.cleaned_data["limit_to_user_list"])
76 | self.assertEqual(form.cleaned_data["namespace_id"], 10)
77 | self.assertFalse(form.cleaned_data["exclude_bots"])
78 |
79 | def test_empty_data(self):
80 | form = FilterForm(data={})
81 | self.assertTrue(form.is_valid())
82 | self.assertIsNone(form.cleaned_data["start_date"])
83 | self.assertIsNone(form.cleaned_data["end_date"])
84 | self.assertFalse(form.cleaned_data["limit_to_user_list"])
85 | self.assertIsNone(form.cleaned_data["namespace_id"])
86 | self.assertFalse(form.cleaned_data["exclude_bots"])
87 |
88 | def test_invalid_start_date(self):
89 | form = FilterForm(data={"start_date": "2025/02/01"})
90 | self.assertFalse(form.is_valid())
91 | self.assertIn("start_date", form.errors)
92 |
93 | def test_clean_start_date(self):
94 | form = FilterForm(data={"start_date": "2023-11-11"})
95 | self.assertTrue(form.is_valid())
96 | self.assertEqual(form.cleaned_data["start_date"], date(2023, 11, 1))
97 |
98 | def test_invalid_end_date(self):
99 | form = FilterForm(data={"end_date": "abcd-12-ef"})
100 | self.assertFalse(form.is_valid())
101 | self.assertIn("end_date", form.errors)
102 |
103 | def test_clean_end_date(self):
104 | form = FilterForm(data={"end_date": "2023-11-01"})
105 | self.assertTrue(form.is_valid())
106 | self.assertEqual(form.cleaned_data["end_date"], date(2023, 11, 30))
107 |
108 | def test_clean_end_date_leap_year(self):
109 | form = FilterForm(data={"end_date": "2024-02-01"})
110 | self.assertTrue(form.is_valid())
111 | self.assertEqual(form.cleaned_data["end_date"], date(2024, 2, 29))
112 |
113 | def test_clean_end_date_feb_non_leap_year(self):
114 | form = FilterForm(data={"end_date": "2025-02-01"})
115 | self.assertTrue(form.is_valid())
116 | self.assertEqual(form.cleaned_data["end_date"], date(2025, 2, 28))
117 |
118 |
119 | class SwiftConnectionTest(TestCase):
120 | @mock.patch.dict(os.environ, SWIFT_TEST_CREDENTIALS, clear=True)
121 | def test_swift_connection(self):
122 | conn = swift.swift_connection()
123 | self.assertIsInstance(conn, swiftclient.Connection)
124 |
125 | @mock.patch.dict(os.environ, {}, clear=True)
126 | def test_swift_connection_validation(self):
127 | with self.assertRaises(RuntimeError):
128 | swift.swift_connection()
129 |
130 |
131 | class SwiftUploadTest(TestCase):
132 | def setUp(self):
133 | self.tmpdir = os.path.join(tempfile.gettempdir(), "SwiftUploadTest")
134 | os.mkdir(self.tmpdir)
135 |
136 | def tearDown(self):
137 | shutil.rmtree(self.tmpdir)
138 |
139 | def write_file(self, path: str, contents="placeholder"):
140 | """
141 | Writes a text file to the temporary directory.
142 | """
143 |
144 | full_path = os.path.join(self.tmpdir, path)
145 | with open(full_path, "w") as f:
146 | f.write(contents)
147 |
148 | return full_path
149 |
150 | @mock.patch("swiftclient.Connection")
151 | @mock.patch.dict(os.environ, SWIFT_TEST_CREDENTIALS, clear=True)
152 | def test_swift_upload(self, mock_swift_connection):
153 | """
154 | Test that we can upload a file to Swift using the helper methods.
155 | """
156 |
157 | mock_conn = mock_swift_connection.return_value
158 | mock_conn.put_object.return_value = ""
159 |
160 | swift.upload_file(
161 | swift.swift_connection(), "fakecontainer", self.write_file("file.txt")
162 | )
163 | mock_conn.put_object.assert_called_once_with(
164 | "fakecontainer", "file.txt", contents=mock.ANY, content_type=mock.ANY
165 | )
166 |
167 | @mock.patch("swiftclient.Connection")
168 | @mock.patch.dict(os.environ, SWIFT_TEST_CREDENTIALS, clear=True)
169 | def test_swift_batch_upload(self, mock_swift_connection):
170 | """
171 | Test that we can upload a batch of files to Swift.
172 | """
173 |
174 | mock_conn = mock_swift_connection.return_value
175 | mock_conn.put_object.return_value = ""
176 |
177 | files = ["file1.txt", "file2.txt", "file3.txt", "file4.txt", "file5.txt"]
178 | for file in files:
179 | self.write_file(os.path.join(self.tmpdir, file))
180 |
181 | conn = swift.swift_connection()
182 | swift.batch_upload_files(
183 | conn,
184 | "fakecontainer",
185 | (os.path.join(self.tmpdir, file) for file in files),
186 | max_workers=3,
187 | )
188 | mock_conn.put_object.assert_has_calls(
189 | (
190 | mock.call(
191 | "fakecontainer",
192 | file,
193 | contents=mock.ANY,
194 | content_type=mock.ANY,
195 | )
196 | for file in files
197 | ),
198 | any_order=True,
199 | )
200 |
--------------------------------------------------------------------------------
/extlinks/common/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path
2 |
3 | from extlinks.common.views import (
4 | CSVProjectTotals,
5 | CSVUserTotals,
6 | )
7 |
8 | # Shared URL paths. These get namespaced by each app's urlpatterns.
9 | urlpatterns = [
10 | path(
11 | "/csv/project_totals",
12 | CSVProjectTotals.as_view(),
13 | name="csv_project_totals",
14 | ),
15 | path("/csv/user_totals", CSVUserTotals.as_view(), name="csv_user_totals"),
16 | ]
17 |
--------------------------------------------------------------------------------
/extlinks/common/views.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from django.contrib.contenttypes.models import ContentType
4 | from django.db.models import Q, Sum
5 | from django.http import HttpResponse
6 | from django.views.generic import View
7 |
8 | from extlinks.aggregates.models import (
9 | LinkAggregate,
10 | PageProjectAggregate,
11 | UserAggregate,
12 | )
13 | from extlinks.common.helpers import build_queryset_filters
14 | from extlinks.links.models import URLPattern, LinkEvent
15 | from extlinks.organisations.models import Collection, Organisation
16 | from extlinks.programs.models import Program
17 |
18 |
19 | # CSV views borrowed from
20 | # https://github.com/WikipediaLibrary/TWLight/blob/master/TWLight/graphs/views.py
21 | # These views are a little hacky in how they determine whether we need the CSV
22 | # for an organisation or partner page, but this seems to work.
23 | class _CSVDownloadView(View):
24 | """
25 | Base view powering CSV downloads. Not intended to be used directly.
26 | URLs should point at subclasses of this view. Subclasses should implement a
27 | _write_data() method.
28 | """
29 |
30 | def get(self, request, *args, **kwargs):
31 | # Create the HttpResponse object with the appropriate CSV header.
32 | response = HttpResponse(content_type="text/csv")
33 | response["Content-Disposition"] = 'attachment; filename="data.csv"'
34 |
35 | self._write_data(response)
36 |
37 | return response
38 |
39 | def _write_data(self, response):
40 | raise NotImplementedError
41 |
42 |
43 | class CSVOrgTotals(_CSVDownloadView):
44 | def _write_data(self, response):
45 | program_pk = self.kwargs["pk"]
46 | queryset_filter = _get_queryset_filter(
47 | program_pk, self.request.build_absolute_uri(), self.request.GET
48 | )
49 |
50 | top_orgs = (
51 | LinkAggregate.objects.filter(queryset_filter)
52 | .values("organisation__pk", "organisation__name")
53 | .annotate(
54 | links_added=Sum("total_links_added"),
55 | links_removed=Sum("total_links_removed"),
56 | links_diff=Sum("total_links_added") - Sum("total_links_removed"),
57 | )
58 | .order_by("-links_diff", "-links_added", "-links_removed")
59 | )
60 |
61 | writer = csv.writer(response)
62 |
63 | writer.writerow(["Organisation", "Links added", "Links removed", "Net Change"])
64 |
65 | for org in top_orgs:
66 | writer.writerow(
67 | [
68 | org["organisation__name"],
69 | org["links_added"],
70 | org["links_removed"],
71 | org["links_diff"],
72 | ]
73 | )
74 |
75 |
76 | class CSVPageTotals(_CSVDownloadView):
77 | def _write_data(self, response):
78 | pk = self.kwargs["pk"]
79 | queryset_filter = _get_queryset_filter(
80 | pk, self.request.build_absolute_uri(), self.request.GET
81 | )
82 |
83 | top_pages = (
84 | PageProjectAggregate.objects.filter(queryset_filter)
85 | .values("project_name", "page_name")
86 | .annotate(
87 | links_added=Sum("total_links_added"),
88 | links_removed=Sum("total_links_removed"),
89 | links_diff=Sum("total_links_added") - Sum("total_links_removed"),
90 | )
91 | .order_by("-links_diff", "-links_added", "-links_removed")
92 | )
93 | writer = csv.writer(response)
94 |
95 | writer.writerow(
96 | ["Page title", "Project", "Links added", "Links removed", "Net Change"]
97 | )
98 |
99 | for page in top_pages:
100 | writer.writerow(
101 | [
102 | page["page_name"],
103 | page["project_name"],
104 | page["links_added"],
105 | page["links_removed"],
106 | page["links_diff"],
107 | ]
108 | )
109 |
110 |
111 | class CSVProjectTotals(_CSVDownloadView):
112 | def _write_data(self, response):
113 | pk = self.kwargs["pk"]
114 | queryset_filter = _get_queryset_filter(
115 | pk, self.request.build_absolute_uri(), self.request.GET
116 | )
117 |
118 | top_projects = (
119 | PageProjectAggregate.objects.filter(queryset_filter)
120 | .values("project_name")
121 | .annotate(
122 | links_added=Sum("total_links_added"),
123 | links_removed=Sum("total_links_removed"),
124 | links_diff=Sum("total_links_added") - Sum("total_links_removed"),
125 | )
126 | .order_by("-links_diff", "-links_added", "-links_removed")
127 | )
128 | writer = csv.writer(response)
129 |
130 | writer.writerow(["Project", "Links added", "Links removed", "Net Change"])
131 |
132 | for project in top_projects:
133 | writer.writerow(
134 | [
135 | project["project_name"],
136 | project["links_added"],
137 | project["links_removed"],
138 | project["links_diff"],
139 | ]
140 | )
141 |
142 |
143 | class CSVUserTotals(_CSVDownloadView):
144 | def _write_data(self, response):
145 | pk = self.kwargs["pk"]
146 | queryset_filter = _get_queryset_filter(
147 | pk, self.request.build_absolute_uri(), self.request.GET
148 | )
149 |
150 | top_users = (
151 | UserAggregate.objects.filter(queryset_filter)
152 | .values("username")
153 | .annotate(
154 | links_added=Sum("total_links_added"),
155 | links_removed=Sum("total_links_removed"),
156 | links_diff=Sum("total_links_added") - Sum("total_links_removed"),
157 | )
158 | .order_by("-links_diff", "-links_added", "-links_removed")
159 | )
160 | writer = csv.writer(response)
161 |
162 | writer.writerow(["Username", "Links added", "Links removed", "Net Change"])
163 |
164 | for user in top_users:
165 | writer.writerow(
166 | [
167 | user["username"],
168 | user["links_added"],
169 | user["links_removed"],
170 | user["links_diff"],
171 | ]
172 | )
173 |
174 |
175 | def _get_queryset_filter(pk, uri, filters):
176 | """
177 | This function returns a Q object with filters depending on which URL a user
178 | is requesting information from
179 |
180 | Parameters
181 | ----------
182 | pk: int
183 | The primary key of a collection or a program, depending on the origin of
184 | the request
185 |
186 | uri: str
187 | The origin URL from the request. If the URL is from the organisations view,
188 | then we will obtain the collection. Otherwise, if the URL is from the
189 |         programs view, we will obtain the organisations associated with that program
190 |
191 | filters: dict
192 | The filters (if there are any) that were passed in the request
193 |
194 | Returns
195 | -------
196 | Q : A Q object which will filter the aggregates queries
197 | """
198 | # If we came from an organisation page, then we are passing the collection id
199 | if "/organisations" in uri:
200 | collection = Collection.objects.get(pk=pk)
201 | queryset_filter = build_queryset_filters(filters, {"collection": collection})
202 | else:
203 | program = Program.objects.prefetch_related("organisation_set").get(pk=pk)
204 | queryset_filter = build_queryset_filters(
205 | filters, {"organisations": program.organisation_set.all()}
206 | )
207 |
208 | return queryset_filter
209 |
--------------------------------------------------------------------------------
/extlinks/healthcheck/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/healthcheck/__init__.py
--------------------------------------------------------------------------------
/extlinks/healthcheck/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
--------------------------------------------------------------------------------
/extlinks/healthcheck/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/healthcheck/migrations/__init__.py
--------------------------------------------------------------------------------
/extlinks/healthcheck/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Create your models here.
4 |
--------------------------------------------------------------------------------
/extlinks/healthcheck/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/extlinks/healthcheck/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path
2 |
3 | from .views import (
4 | AggregatesCronHealthCheckView,
5 | CommonCronHealthCheckView,
6 | LinksCronHealthCheckView,
7 | OrganizationsCronHealthCheckView,
8 | LinkEventHealthCheckView,
9 | MonthlyAggregatesCronHealthCheckView,
10 | )
11 |
12 | urlpatterns = [
13 | path("link_event", LinkEventHealthCheckView.as_view(), name="link_event"),
14 | path("agg_crons", AggregatesCronHealthCheckView.as_view(), name="agg_crons"),
15 | path("common_crons", CommonCronHealthCheckView.as_view(), name="common_crons"),
16 | path("link_crons", LinksCronHealthCheckView.as_view(), name="link_crons"),
17 | path("org_crons", OrganizationsCronHealthCheckView.as_view(), name="org_crons"),
18 | path("month_agg_crons", MonthlyAggregatesCronHealthCheckView.as_view(), name="month_agg_crons"),
19 | ]
20 |
--------------------------------------------------------------------------------
/extlinks/healthcheck/views.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 | import os
3 | from django.http import JsonResponse
4 | from django.views import View
5 | from django.utils.decorators import method_decorator
6 | from django.utils.timezone import now
7 | from django.views.decorators.cache import cache_page
8 | from extlinks.aggregates.models import (
9 | LinkAggregate,
10 | UserAggregate,
11 | PageProjectAggregate,
12 | )
13 | from extlinks.links.models import LinkEvent, LinkSearchTotal
14 | from extlinks.organisations.models import Organisation
15 |
16 |
17 | @method_decorator(cache_page(60 * 1), name="dispatch")
18 | class LinkEventHealthCheckView(View):
19 | """
20 | Healthcheck that passes only if the latest link event is less than a day old
21 | """
22 |
23 | def get(self, request, *args, **kwargs):
24 | status_code = 500
25 | status_msg = "error"
26 | try:
27 | latest_linkevent_datetime = LinkEvent.objects.all().latest().timestamp
28 | cutoff_datetime = now() - timedelta(days=1)
29 | if latest_linkevent_datetime > cutoff_datetime:
30 | status_code = 200
31 | status_msg = "ok"
32 | else:
33 | status_msg = "out of date"
34 | except LinkEvent.DoesNotExist:
35 | status_code = 404
36 | status_msg = "not found"
37 | response = JsonResponse({"status": status_msg})
38 | response.status_code = status_code
39 | return response
40 |
41 |
42 | @method_decorator(cache_page(60 * 1), name="dispatch")
43 | class AggregatesCronHealthCheckView(View):
44 | """
45 | Healthcheck that passes only if the link aggregate jobs have all run successfully in the last 2 days
46 | """
47 |
48 | def get_most_recent(self, aggregate, monthly=False):
49 | try:
50 | if monthly:
51 |                 return aggregate.objects.filter(day=0).latest("full_date").full_date
52 |             else:
53 |                 return aggregate.objects.exclude(day=0).latest("full_date").full_date
54 |         except aggregate.DoesNotExist:
55 |             return None
56 |
57 | def get(self, request, *args, **kwargs):
58 | status_code = 500
59 | status_msg = "error"
60 | try:
61 | latest_link_aggregates_cron_endtime = self.get_most_recent(LinkAggregate)
62 | latest_user_aggregates_cron_endtime = self.get_most_recent(UserAggregate)
63 | latest_pageproject_aggregates_cron_endtime = self.get_most_recent(
64 | PageProjectAggregate
65 | )
66 |             cutoff_datetime = now().date() - timedelta(days=2)
67 | if latest_link_aggregates_cron_endtime < cutoff_datetime:
68 | status_msg = "out of date"
69 | elif latest_user_aggregates_cron_endtime < cutoff_datetime:
70 | status_msg = "out of date"
71 | elif latest_pageproject_aggregates_cron_endtime < cutoff_datetime:
72 | status_msg = "out of date"
73 | else:
74 | status_code = 200
75 | status_msg = "ok"
76 |         except Exception:
77 | status_code = 404
78 | status_msg = "not found"
79 | response = JsonResponse({"status": status_msg})
80 | response.status_code = status_code
81 | return response
82 |
83 |
84 | @method_decorator(cache_page(60 * 1), name="dispatch")
85 | class MonthlyAggregatesCronHealthCheckView(AggregatesCronHealthCheckView):
86 | """
87 | Healthcheck that passes only if the monthly aggregate jobs have all run successfully in the last month
88 | """
89 |
90 | def get(self, request, *args, **kwargs):
91 | status_code = 500
92 | status_msg = "error"
93 | try:
94 | latest_link_aggregates_cron_endtime = self.get_most_recent(
95 | LinkAggregate, True
96 | )
97 | latest_user_aggregates_cron_endtime = self.get_most_recent(
98 | UserAggregate, True
99 | )
100 | latest_pageproject_aggregates_cron_endtime = self.get_most_recent(
101 | PageProjectAggregate, True
102 | )
103 | # Monthly jobs may take some time to run, let's give 35 days to make sure
104 |             cutoff_datetime = now().date() - timedelta(days=35)
105 | if latest_link_aggregates_cron_endtime < cutoff_datetime:
106 | status_msg = "out of date"
107 | elif latest_user_aggregates_cron_endtime < cutoff_datetime:
108 | status_msg = "out of date"
109 | elif latest_pageproject_aggregates_cron_endtime < cutoff_datetime:
110 | status_msg = "out of date"
111 | else:
112 | status_code = 200
113 | status_msg = "ok"
114 |         except Exception:
115 | status_code = 404
116 | status_msg = "not found"
117 | response = JsonResponse({"status": status_msg})
118 | response.status_code = status_code
119 | return response
120 |
121 |
122 | @method_decorator(cache_page(60 * 1), name="dispatch")
123 | class CommonCronHealthCheckView(View):
124 | """
125 | Healthcheck that passes only if a backup file has been created in the last 3 days
126 | """
127 |
128 | def get(self, request, *args, **kwargs):
129 | status_code = 500
130 |         # Start pessimistic: report "out of date" unless a recent backup exists
131 |         status_msg = "out of date"
132 | for i in range(3):
133 | date = now() - timedelta(days=i)
134 | filename = "links_linkevent_{}.json.gz".format(date.strftime("%Y%m%d"))
135 | file = os.path.join(os.environ["HOST_BACKUP_DIR"], filename)
136 | if os.path.isfile(file):
137 | status_code = 200
138 | status_msg = "ok"
139 | break
140 | response = JsonResponse({"status": status_msg})
141 | response.status_code = status_code
142 | return response
143 |
144 |
145 | @method_decorator(cache_page(60 * 1), name="dispatch")
146 | class LinksCronHealthCheckView(View):
147 | """
148 | Healthcheck that passes only if the links jobs have all run successfully in the last 9 days
149 | """
150 |
151 | def get(self, request, *args, **kwargs):
152 | status_code = 500
153 | status_msg = "error"
154 | try:
155 | latest_total_links_endtime = LinkSearchTotal.objects.latest("date").date
156 | cutoff_datetime = now().date() - timedelta(days=9)
157 | if latest_total_links_endtime < cutoff_datetime:
158 | status_msg = "out of date"
159 | else:
160 | status_code = 200
161 | status_msg = "ok"
162 |         except Exception:
163 | status_code = 404
164 | status_msg = "not found"
165 | response = JsonResponse({"status": status_msg})
166 | response.status_code = status_code
167 | return response
168 |
169 |
170 | @method_decorator(cache_page(60 * 1), name="dispatch")
171 | class OrganizationsCronHealthCheckView(View):
172 | """
173 | Healthcheck that passes only if the Organizations jobs have all run successfully in the last 2 hours
174 | """
175 |
176 | def get(self, request, *args, **kwargs):
177 | status_code = 500
178 | status_msg = "error"
179 | try:
180 | latest_user_lists_endtime = Organisation.objects.latest(
181 | "username_list_updated"
182 | ).username_list_updated
183 | cutoff_datetime = now() - timedelta(hours=2)
184 | if latest_user_lists_endtime < cutoff_datetime:
185 | status_msg = "out of date"
186 | else:
187 | status_code = 200
188 | status_msg = "ok"
189 |         except Exception:
190 | status_code = 404
191 | status_msg = "not found"
192 | response = JsonResponse({"status": status_msg})
193 | response.status_code = status_code
194 | return response
195 |
--------------------------------------------------------------------------------
/extlinks/links/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/links/__init__.py
--------------------------------------------------------------------------------
/extlinks/links/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 | from django.contrib.contenttypes.admin import GenericTabularInline
3 |
4 | from .models import URLPattern, LinkSearchTotal, LinkEvent
5 |
6 |
7 | class LinkEventURLPatternAdminInline(GenericTabularInline):
8 | model = LinkEvent
9 | # Although not ideal, changing this to False has improved performance
10 | show_change_link = False
11 | exclude = ["user_id", "url"]
12 | readonly_fields = [
13 | "link",
14 | "timestamp",
15 | "domain",
16 | "rev_id",
17 | "page_title",
18 | "page_namespace",
19 | "event_id",
20 | "user_is_bot",
21 | "hash_link_event_id",
22 | "change",
23 | "username",
24 | "on_user_list",
25 | ]
26 |
27 | def get_queryset(self, request):
28 | qs = super().get_queryset(request)
29 |
30 | return qs.select_related("username")
31 |
32 |
33 | class URLPatternAdmin(admin.ModelAdmin):
34 | list_display = ("url",)
35 | exclude = ["collections"]
36 | autocomplete_fields = ["collection"]
37 | inlines = [
38 | LinkEventURLPatternAdminInline,
39 | ]
40 |
41 |
42 | admin.site.register(URLPattern, URLPatternAdmin)
43 |
44 |
45 | class LinkSearchTotalAdmin(admin.ModelAdmin):
46 | list_display = ("url", "date", "total")
47 | list_select_related = ["url"]
48 |
49 |
50 | admin.site.register(LinkSearchTotal, LinkSearchTotalAdmin)
51 |
52 |
53 | class LinkEventAdmin(admin.ModelAdmin):
54 | list_display = ("link", "timestamp", "domain", "username", "change")
55 | list_select_related = ["username", "content_type"]
56 | readonly_fields = ["url_pattern_display", "username"]
57 | exclude = ["content_type", "object_id", "url"]
58 |
59 | @admin.display(description="URLPattern")
60 | def url_pattern_display(self, instance):
61 | return URLPattern.objects.filter(pk=instance.content_object.pk).first()
62 |
63 |
64 | admin.site.register(LinkEvent, LinkEventAdmin)
65 |
--------------------------------------------------------------------------------
/extlinks/links/factories.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | import factory
3 | import random
4 |
5 | from extlinks.organisations.factories import UserFactory, CollectionFactory
6 | from .models import LinkEvent, LinkSearchTotal, URLPattern
7 |
8 | class URLPatternFactory(factory.django.DjangoModelFactory):
9 | class Meta:
10 | model = URLPattern
11 | strategy = factory.CREATE_STRATEGY
12 |
13 |     # Generate the URL lazily; strip the "https://" prefix and trailing "/"
14 |     url = factory.LazyFunction(lambda: factory.Faker._get_faker().url(schemes=["https"])[8:-1])
15 |
16 |
17 | class LinkEventFactory(factory.django.DjangoModelFactory):
18 | class Meta:
19 | model = LinkEvent
20 | strategy = factory.CREATE_STRATEGY
21 |
22 | # We don't define any automatically generated link here, because it
23 | # needs to directly correspond to the url field for this object too.
24 | timestamp = datetime.now(timezone.utc)
25 | domain = "en.wikipedia.org"
26 | username = factory.SubFactory(UserFactory)
27 | rev_id = random.randint(10000000, 100000000)
28 | user_id = random.randint(10000000, 100000000)
29 | page_title = factory.Faker("word")
30 | page_namespace = 0
31 | event_id = factory.Faker("uuid4")
32 | change = LinkEvent.ADDED
33 | on_user_list = False
34 |
35 |
36 | class LinkSearchTotalFactory(factory.django.DjangoModelFactory):
37 | class Meta:
38 | model = LinkSearchTotal
39 | strategy = factory.CREATE_STRATEGY
40 |
41 | url = factory.SubFactory(URLPatternFactory)
42 | date = datetime.today()
43 | total = random.randint(1, 1000)
44 |
--------------------------------------------------------------------------------
/extlinks/links/helpers.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import unquote
2 |
3 | from .models import URLPattern
4 |
5 |
6 | def split_url_for_query(url):
7 | """
8 | Given a URL pattern, split it into two components:
9 | url_optimised: URL and domain name in the el_index format
10 | (https://www.mediawiki.org/wiki/Manual:Externallinks_table#el_index)
11 | url_pattern_end: Anything following the domain name
12 | """
13 | url = url.strip() # Catch any trailing spaces
14 | # Start after *. if present
15 | if url.startswith("*."):
16 | url = url[2:]
17 |
18 | url_start = url.split("/")[0].split(".")[::-1]
19 | url_optimised = ".".join(url_start) + ".%"
20 |
21 | if "/" in url:
22 | url_end = "/".join(url.split("/")[1:])
23 | url_pattern_end = "%./" + url_end + "%"
24 | else:
25 | url_pattern_end = "%"
26 |
27 | return url_optimised, url_pattern_end
28 |
29 |
30 | def link_is_tracked(link):
31 | all_urlpatterns = URLPattern.objects.all()
32 | tracked_links_list = list(all_urlpatterns.values_list("url", flat=True))
33 | proxied_url = False
34 |
35 | # If this looks like a TWL proxied URL we're going to need to match
36 | # it against a longer list of strings
37 | if "wikipedialibrary.idm.oclc" in link:
38 | proxied_url = True
39 | proxied_urls = [urlpattern.get_proxied_url for urlpattern in all_urlpatterns]
40 | tracked_links_list.extend(proxied_urls)
41 |
42 | # This is a quick check so we can filter the majority of events
43 | # which won't be matching our filters
44 | if any(links in link for links in tracked_links_list):
45 | # Then we do a more detailed check, to make sure this is the
46 | # root URL.
47 | for tracked_link in tracked_links_list:
48 | # If we track apa.org, we don't want to match iaapa.org
49 | # so we make sure the URL is actually pointing at apa.org
50 | url_starts = ["//" + tracked_link, "." + tracked_link]
51 | if proxied_url:
52 | # Proxy URLs may contain //www- not //www.
53 | url_starts.append("-" + tracked_link)
54 |
55 | # We want to avoid link additions from e.g. InternetArchive
56 | # where the URL takes the structure
57 | # https://web.archive.org/https://test.com/
58 | protocol_count = link.count("//")
59 |
60 |             if any(start in link for start in url_starts) and protocol_count < 2:
61 |                 return True
62 |
63 |     return False
64 |
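
A worked example (not repository code) of split_url_for_query: a leading "*." is dropped, the domain is reversed into el_index order with a trailing "%" wildcard, and anything after the domain becomes a second LIKE pattern.

    from extlinks.links.helpers import split_url_for_query

    assert split_url_for_query("*.example.com/journals") == ("com.example.%", "%./journals%")
    assert split_url_for_query("www.acme.org") == ("org.acme.www.%", "%")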
--------------------------------------------------------------------------------
/extlinks/links/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/links/management/__init__.py
--------------------------------------------------------------------------------
/extlinks/links/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/links/management/commands/__init__.py
--------------------------------------------------------------------------------
/extlinks/links/management/commands/fix_proxy_linkevents_on_user_list.py:
--------------------------------------------------------------------------------
1 | from extlinks.common.management.commands import BaseCommand
2 | from django.core.management import call_command
3 |
4 | from extlinks.aggregates.models import (
5 | LinkAggregate,
6 | PageProjectAggregate,
7 | UserAggregate,
8 | )
9 | from extlinks.links.models import LinkEvent
10 |
11 |
12 | class Command(BaseCommand):
13 | help = "Fixes all those proxy linkevents that aren't in the user list"
14 |
15 | def _handle(self, *args, **options):
16 | proxy_not_on_user_list_linkevents = LinkEvent.objects.filter(
17 | link__contains="wikipedialibrary.idm.oclc", on_user_list=False
18 | )
19 |
20 | if proxy_not_on_user_list_linkevents.exists():
21 | earliest_link_date = proxy_not_on_user_list_linkevents.earliest(
22 | "timestamp"
23 | ).timestamp
24 | collection_list = set()
25 | for linkevent in proxy_not_on_user_list_linkevents:
26 | # Get URLPatterns associated with the linkevent
27 | url = linkevent.content_object
28 | # Get the organisation from the first url
29 | if url:
30 | collections = url.collections.all()
31 | for collection in collections:
32 | collection_list.add(collection.id)
33 | organisation = collection.organisation
34 | username_list = organisation.username_list
35 | if username_list:
36 | if linkevent.username in username_list.all():
37 | linkevent.on_user_list = True
38 | linkevent.save()
39 |
40 | if collection_list:
41 | LinkAggregate.objects.filter(
42 | collection__in=collection_list, full_date__gte=earliest_link_date
43 | ).delete()
44 | PageProjectAggregate.objects.filter(
45 | collection__in=collection_list, full_date__gte=earliest_link_date
46 | ).delete()
47 | UserAggregate.objects.filter(
48 | collection__in=collection_list, full_date__gte=earliest_link_date
49 | ).delete()
50 |
51 | call_command("fill_link_aggregates", collections=collection_list)
52 | call_command("fill_pageproject_aggregates", collections=collection_list)
53 | call_command("fill_user_aggregates", collections=collection_list)
54 |
--------------------------------------------------------------------------------
/extlinks/links/management/commands/linkevent_example_data.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import random
3 | from faker import Faker
4 |
5 | from extlinks.common.management.commands import BaseCommand
6 |
7 | from extlinks.organisations.models import User
8 | from ...models import URLPattern, LinkEvent
9 |
10 |
11 | class Command(BaseCommand):
12 | help = "Backfills a set of linkevents for each url pattern"
13 |
14 | def add_arguments(self, parser):
15 | parser.add_argument("num_events", nargs="+", type=int)
16 |
17 | def _handle(self, *args, **options):
18 | # Number of link events to log in total
19 | num_events = options["num_events"][0]
20 |
21 | fake = Faker()
22 | languages = ["en", "de", "fr", "he", "hi", "ta"]
23 | users = User.objects.all()
24 | # Hacky way of adding a weighted random choice of change type.
25 | # Addition is likely to be more prevalent.
26 | change_choices = [
27 | LinkEvent.ADDED,
28 | LinkEvent.ADDED,
29 | LinkEvent.ADDED,
30 | LinkEvent.REMOVED,
31 | ]
32 |
33 | urlpatterns = URLPattern.objects.all()
34 |
35 | for _ in range(num_events):
36 | urlpattern = random.choice(urlpatterns)
37 | organisation = urlpattern.collections.first().organisation
38 | random_user = random.choice(users)
39 |
40 | # If this org limits by user, choose either a random user who
41 | # isn't on the org's user list, or from the org's user list.
42 | on_user_list = False
43 | if organisation.limit_by_user:
44 | username_list = organisation.username_list.all()
45 | if random_user in username_list:
46 | on_user_list = True
47 |
48 | link_event = LinkEvent.objects.create(
49 | link=urlpattern.url + "/" + fake.word(),
50 | timestamp=fake.date_time_between(
51 | start_date=datetime.now() - timedelta(days=365),
52 | end_date="now",
53 | tzinfo=timezone.utc,
54 | ),
55 | domain=random.choice(languages) + ".wikipedia.org",
56 | username=random_user,
57 | rev_id=random.randint(10000000, 100000000),
58 | user_id=random.randint(10000000, 100000000),
59 | page_title=fake.word(),
60 | page_namespace=0,
61 | event_id=fake.uuid4(),
62 | change=random.choice(change_choices),
63 | on_user_list=on_user_list,
64 | )
65 | urlpattern.link_events.add(link_event)
66 | urlpattern.save()
67 |
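
A hedged usage sketch (not repository code): the command takes the number of LinkEvents to generate and assumes organisations, collections, URL patterns, and users already exist.

    from django.core.management import call_command

    call_command("linkevent_example_data", 1000)  # creates 1000 fake LinkEvents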
--------------------------------------------------------------------------------
/extlinks/links/management/commands/linksearchtotal_collect.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import MySQLdb
3 | import os
4 |
5 | from extlinks.common.management.commands import BaseCommand
6 | from django.db import close_old_connections
7 |
8 | from extlinks.links.helpers import split_url_for_query
9 | from extlinks.links.models import LinkSearchTotal, URLPattern
10 | from extlinks.settings.base import BASE_DIR
11 |
12 |
13 | class Command(BaseCommand):
14 | help = "Updates link totals from externallinks table"
15 |
16 | def _handle(self, *args, **options):
17 | protocols = ["http", "https"]
18 |
19 | with open(os.path.join(BASE_DIR, "wiki-list.csv"), "r") as wiki_list:
20 | csv_reader = csv.reader(wiki_list)
21 | wiki_list_data = []
22 | for row in csv_reader:
23 | wiki_list_data.append(row[0])
24 |
25 | all_urlpatterns = URLPattern.objects.all()
26 |
27 | total_links_dictionary = {}
28 | for i, language in enumerate(wiki_list_data):
29 | db = MySQLdb.connect(
30 | host="{lang}wiki.analytics.db.svc.wikimedia.cloud".format(
31 | lang=language
32 | ),
33 | user=os.environ["REPLICA_DB_USER"],
34 | passwd=os.environ["REPLICA_DB_PASSWORD"],
35 | db="{lang}wiki_p".format(lang=language),
36 | )
37 |
38 | cur = db.cursor()
39 |
40 | for urlpattern in all_urlpatterns:
41 | # For the first language, initialise tracking
42 | if i == 0:
43 | total_links_dictionary[urlpattern.pk] = 0
44 |
45 | url = urlpattern.url
46 | optimised_url, url_pattern_end = split_url_for_query(url)
47 |
48 | for protocol in protocols:
49 | url_pattern_start = protocol + "://" + optimised_url
50 |
51 | cur.execute(
52 | """SELECT COUNT(*) FROM externallinks
53 | WHERE el_to_domain_index LIKE '{url_start}'
54 | AND el_to_domain_index LIKE '{url_end}'
55 | """.format(
56 | url_start=url_pattern_start, url_end=url_pattern_end
57 | )
58 | )
59 |
60 | this_num_urls = cur.fetchone()[0]
61 |
62 | total_links_dictionary[urlpattern.pk] += this_num_urls
63 |
64 | for urlpattern_pk, total_count in total_links_dictionary.items():
65 | linksearch_object = LinkSearchTotal(
66 | url=URLPattern.objects.get(pk=urlpattern_pk), total=total_count
67 | )
68 | linksearch_object.save()
69 |
70 | close_old_connections()
71 |
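The command expects a wiki-list.csv under BASE_DIR (/app) whose first column holds wiki language codes; a minimal sketch of producing such a file (the codes listed are only examples):

    # Hypothetical helper: one language code per row, matching how the command
    # reads row[0] and builds "{lang}wiki.analytics.db.svc.wikimedia.cloud".
    import csv

    with open("/app/wiki-list.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for lang in ["en", "de", "fr"]:
            writer.writerow([lang])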
--------------------------------------------------------------------------------
/extlinks/links/management/commands/linksearchtotal_example_data.py:
--------------------------------------------------------------------------------
1 | from datetime import date, timedelta
2 | import random
3 |
4 | from extlinks.common.management.commands import BaseCommand
5 |
6 | from ...models import URLPattern, LinkSearchTotal
7 |
8 |
9 | class Command(BaseCommand):
10 | help = "Backfills a set of LinkSearchTotals for each url pattern"
11 |
12 | def add_arguments(self, parser):
13 | parser.add_argument("weeks", nargs="+", type=int)
14 |
15 | def _handle(self, *args, **options):
16 | # The number of weeks to go back
17 | num_dates = options["weeks"][0]
18 |
19 | urlpatterns = URLPattern.objects.all()
20 | for urlpattern in urlpatterns:
21 | date_total = random.randint(500, 30000)
22 | this_date = date.today()
23 |
24 | for _ in range(num_dates):
25 |
26 | # Each week, going backwards, we lose between 0 and 10%
27 | # of the total number of links.
28 | less_total = random.randint(0, int(date_total * 0.1))
29 | date_total -= less_total
30 |
31 | new_total = LinkSearchTotal(
32 | url=urlpattern, date=this_date, total=date_total
33 | )
34 | new_total.save()
35 |
36 | this_date = this_date - timedelta(days=7)
37 |
--------------------------------------------------------------------------------
/extlinks/links/management/commands/remove_ezproxy_collection.py:
--------------------------------------------------------------------------------
1 | from django.contrib.contenttypes.models import ContentType
2 | from extlinks.common.management.commands import BaseCommand
3 | from django.core.management import call_command
4 |
5 | from extlinks.aggregates.models import (
6 | LinkAggregate,
7 | PageProjectAggregate,
8 | UserAggregate,
9 | )
10 | from extlinks.links.models import URLPattern, LinkEvent
11 | from extlinks.organisations.models import Organisation, Collection
12 |
13 |
14 | class Command(BaseCommand):
15 | help = "Deletes the EZProxy collection and organisation and reassigns those LinkEvents to new URLPatterns"
16 |
17 | def _handle(self, *args, **options):
18 | ezproxy_org = self._get_ezproxy_organisation()
19 | ezproxy_collection = self._get_ezproxy_collection()
20 | url_patterns = ezproxy_collection.get_url_patterns().all()
21 |
22 | # Only the LinkEvents attached to the EZProxy URLPatterns need reassigning,
23 | # so narrow the queryset to those before matching them against collections.
24 | linkevents = LinkEvent.objects.filter(object_id__in=[p.id for p in url_patterns])
25 | collections = Collection.objects.all()
26 | self._process_linkevents_collections(linkevents, collections)
27 | self._delete_aggregates_ezproxy(ezproxy_org, ezproxy_collection, url_patterns)
28 |
29 | def _get_ezproxy_organisation(self):
30 | """
31 | Gets the EZProxy organisation, or returns None if it's already been deleted
32 |
33 | Parameters
34 | ----------
35 |
36 | Returns
37 | -------
38 | Organisation object or None
39 | """
40 | if Organisation.objects.filter(name="Wikipedia Library OCLC EZProxy").exists():
41 | return Organisation.objects.get(name="Wikipedia Library OCLC EZProxy")
42 |
43 | return None
44 |
45 | def _get_ezproxy_collection(self):
46 | """
47 | Gets the EZProxy collection, or returns None if it's already been deleted
48 |
49 | Parameters
50 | ----------
51 |
52 | Returns
53 | -------
54 | Collection object or None
55 | """
56 | if Collection.objects.filter(name="EZProxy").exists():
57 | return Collection.objects.get(name="EZProxy")
58 |
59 | return None
60 |
61 | def _get_ezproxy_url_patterns(self, collection):
62 | """
63 | Gets the EZProxy URLPattern, or returns None if it's already been deleted
64 |
65 | Parameters
66 | ----------
67 | collection: The collection the URLPatterns belong to
68 |
69 | Returns
70 | -------
71 | URLPattern object or None
72 | """
73 | if collection and URLPattern.objects.filter(collection=collection).exists():
74 | return URLPattern.objects.get(collection=collection)
75 |
76 | return None
77 |
78 | def _delete_aggregates_ezproxy(self, ezproxy_org, ezproxy_collection, url_patterns):
79 | """
80 | Deletes any aggregate with the EZProxy collection and organisation,
81 | then deletes the collection, organisation and url patterns
82 |
83 | Parameters
84 | ----------
85 | ezproxy_org: Organisation
86 | The organisation to filter and delete the aggregates tables and that
87 | will later be deleted
88 |
89 | ezproxy_collection: Collection
90 | The collection to filter and delete the aggregates tables and that
91 | will later be deleted
92 |
93 | url_patterns: URLPattern
94 | The EZProxy URLPatterns that will be deleted
95 |
96 | Returns
97 | -------
98 |
99 | """
100 | LinkAggregate.objects.filter(
101 | organisation=ezproxy_org, collection=ezproxy_collection
102 | ).delete()
103 | PageProjectAggregate.objects.filter(
104 | organisation=ezproxy_org, collection=ezproxy_collection
105 | ).delete()
106 | UserAggregate.objects.filter(
107 | organisation=ezproxy_org, collection=ezproxy_collection
108 | ).delete()
109 |
110 | url_patterns.delete()
111 | ezproxy_collection.delete()
112 | ezproxy_org.delete()
113 |
114 | def _process_linkevents_collections(self, linkevents, collections):
115 | """
116 | Loops through all collections to get their url patterns. If a linkevent's
117 | link matches a URLPattern, the linkevent is added to that URLPattern. That way,
118 | it will be counted when the aggregates commands are run again
119 |
120 | Parameters
121 | ----------
122 | linkevents: Queryset[LinkEvent]
123 | LinkEvents that have no URLPatterns assigned (and therefore no collection assigned)
124 |
125 | collections: Queryset[Collection]
126 | All of the collections
127 |
128 | Returns
129 | -------
130 |
131 | """
132 | for collection in collections:
133 | linkevents_changed = 0
134 | collection_urls = collection.get_url_patterns()
135 | for url_pattern in collection_urls:
136 | for linkevent in linkevents:
137 | proxy_url = url_pattern.url.replace(".", "-")
138 | if url_pattern.url in linkevent.link or proxy_url in linkevent.link:
139 | url_pattern.link_events.add(linkevent)
140 | url_pattern.save()
141 | linkevents_changed += 1
142 | if linkevents_changed > 0:
143 | # There have been changes to this collection, so we must delete
144 | # the aggregates tables for that collection and run the commands
145 | # for it
146 | LinkAggregate.objects.filter(collection=collection).delete()
147 | PageProjectAggregate.objects.filter(collection=collection).delete()
148 | UserAggregate.objects.filter(collection=collection).delete()
149 |
150 | call_command("fill_link_aggregates", collections=[collection.pk])
151 | call_command("fill_pageproject_aggregates", collections=[collection.pk])
152 | call_command("fill_user_aggregates", collections=[collection.pk])
153 |
--------------------------------------------------------------------------------
/extlinks/links/management/commands/upload_all_archived.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from extlinks.common.management.commands import BaseCommand
4 | from django.core.management import call_command
5 |
6 |
7 | class Command(BaseCommand):
8 | help = "Uploads all archives currently located in the backup directory to object storage"
9 |
10 | def add_arguments(self, parser):
11 | parser.add_argument(
12 | "--dir",
13 | help="The directory from which to upload archives.",
14 | type=str
15 | )
16 |
17 | def _handle(self, *args, **options):
18 | path = options['dir']
19 | for filename in os.listdir(path):
20 | if filename.endswith('.json.gz') and filename.startswith('links_linkevent_'):
21 | file_path = os.path.join(path, filename)
22 | if os.path.isfile(file_path):
23 | call_command("linkevents_archive", "upload", file_path)
24 |
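A possible invocation, assuming the archives were written to /app/backup (the directory is an assumption; the command uploads from whatever --dir it is given):

    # Hypothetical usage: hands every links_linkevent_*.json.gz file in the
    # directory to the "linkevents_archive upload" command.
    from django.core.management import call_command
    call_command("upload_all_archived", dir="/app/backup")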
--------------------------------------------------------------------------------
/extlinks/links/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-05-20 14:01
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | initial = True
10 |
11 | dependencies = [
12 | ("organisations", "0001_initial"),
13 | ]
14 |
15 | operations = [
16 | migrations.CreateModel(
17 | name="URLPattern",
18 | fields=[
19 | (
20 | "id",
21 | models.AutoField(
22 | auto_created=True,
23 | primary_key=True,
24 | serialize=False,
25 | verbose_name="ID",
26 | ),
27 | ),
28 | ("url", models.CharField(max_length=60)),
29 | (
30 | "collection",
31 | models.ForeignKey(
32 | null=True,
33 | on_delete=django.db.models.deletion.SET_NULL,
34 | related_name="url",
35 | to="organisations.Collection",
36 | ),
37 | ),
38 | ],
39 | options={
40 | "verbose_name_plural": "URL patterns",
41 | "verbose_name": "URL pattern",
42 | },
43 | ),
44 | migrations.CreateModel(
45 | name="LinkSearchTotal",
46 | fields=[
47 | (
48 | "id",
49 | models.AutoField(
50 | auto_created=True,
51 | primary_key=True,
52 | serialize=False,
53 | verbose_name="ID",
54 | ),
55 | ),
56 | ("date", models.DateField(auto_now_add=True)),
57 | ("total", models.PositiveIntegerField()),
58 | (
59 | "url",
60 | models.ForeignKey(
61 | null=True,
62 | on_delete=django.db.models.deletion.SET_NULL,
63 | to="links.URLPattern",
64 | ),
65 | ),
66 | ],
67 | options={
68 | "verbose_name_plural": "LinkSearch totals",
69 | "verbose_name": "LinkSearch total",
70 | },
71 | ),
72 | migrations.CreateModel(
73 | name="LinkEvent",
74 | fields=[
75 | (
76 | "id",
77 | models.AutoField(
78 | auto_created=True,
79 | primary_key=True,
80 | serialize=False,
81 | verbose_name="ID",
82 | ),
83 | ),
84 | ("link", models.CharField(max_length=2083)),
85 | ("timestamp", models.DateTimeField()),
86 | ("domain", models.CharField(max_length=32)),
87 | ("username", models.CharField(max_length=255)),
88 | ("rev_id", models.PositiveIntegerField(null=True)),
89 | ("user_id", models.PositiveIntegerField()),
90 | ("page_title", models.CharField(max_length=255)),
91 | ("page_namespace", models.IntegerField()),
92 | ("event_id", models.CharField(max_length=36)),
93 | ("change", models.IntegerField(choices=[(0, "Removed"), (1, "Added")])),
94 | ("on_user_list", models.BooleanField(default=False)),
95 | (
96 | "url",
97 | models.ManyToManyField(
98 | related_name="linkevent", to="links.URLPattern"
99 | ),
100 | ),
101 | ],
102 | options={
103 | "get_latest_by": "timestamp",
104 | },
105 | ),
106 | ]
107 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0002_auto_20190520_1530.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-05-20 15:30
2 |
3 | import datetime
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ("links", "0001_initial"),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterField(
15 | model_name="linksearchtotal",
16 | name="date",
17 | field=models.DateField(default=datetime.date.today),
18 | ),
19 | ]
20 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0003_auto_20190530_1045.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-05-30 10:45
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("links", "0002_auto_20190520_1530"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name="urlpattern",
15 | name="url",
16 | field=models.CharField(max_length=150),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0004_auto_20190603_1110.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-03 11:10
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("links", "0003_auto_20190530_1045"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name="linkevent",
15 | name="user_id",
16 | field=models.PositiveIntegerField(null=True),
17 | ),
18 | migrations.AddConstraint(
19 | model_name="linksearchtotal",
20 | constraint=models.UniqueConstraint(
21 | fields=("url", "date"), name="unique_date_total"
22 | ),
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0005_linkevent_user_is_bot.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-04 09:43
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("links", "0004_auto_20190603_1110"),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name="linkevent",
15 | name="user_is_bot",
16 | field=models.BooleanField(default=False),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0006_auto_20190628_1221.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-28 12:21
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ("organisations", "0005_auto_20190628_1221"),
11 | ("links", "0005_linkevent_user_is_bot"),
12 | ]
13 |
14 | operations = [
15 | migrations.AlterField(
16 | model_name="linkevent",
17 | name="username",
18 | field=models.ForeignKey(
19 | null=True,
20 | on_delete=django.db.models.deletion.SET_NULL,
21 | to="organisations.User",
22 | ),
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0007_auto_20190730_1355.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.3 on 2019-07-30 13:55
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("links", "0006_auto_20190628_1221"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name="linkevent",
15 | name="change",
16 | field=models.IntegerField(
17 | choices=[(0, "Removed"), (1, "Added")], db_index=True
18 | ),
19 | ),
20 | migrations.AlterField(
21 | model_name="linkevent",
22 | name="domain",
23 | field=models.CharField(db_index=True, max_length=32),
24 | ),
25 | ]
26 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0008_fill_proquest_openurl.py:
--------------------------------------------------------------------------------
1 | from django.db import migrations
2 |
3 |
4 | def add_link_events_to_proquest_openurl_collection(apps, schema_editor):
5 | LinkEvent = apps.get_model("links", "LinkEvent")
6 | Collection = apps.get_model("organisations", "Collection")
7 | URLPattern = apps.get_model("links", "URLPattern")
8 | proquest_openurl_collection = Collection.objects.filter(name="Proquest OpenURL")
9 | if proquest_openurl_collection:
10 | proquest_openurl_linkevents = LinkEvent.objects.filter(
11 | link__icontains="gateway.proquest.com/openurl"
12 | )
13 | for proquest_openurl_linkevent in proquest_openurl_linkevents:
14 | proquest_openurl_linkevent.url.add(proquest_openurl_collection[0].url.get())
15 | proquest_openurl_linkevent.save()
16 |
17 |
18 | class Migration(migrations.Migration):
19 |
20 | dependencies = [("links", "0007_auto_20190730_1355")]
21 |
22 | operations = [migrations.RunPython(add_link_events_to_proquest_openurl_collection)]
23 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0009_auto_20230215_1656.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.14 on 2023-02-15 16:56
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('links', '0008_fill_proquest_openurl'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name='linkevent',
15 | name='hash_link_event_id',
16 | field=models.CharField(blank=True, max_length=256),
17 | ),
18 | migrations.AddIndex(
19 | model_name='linkevent',
20 | index=models.Index(fields=['hash_link_event_id'], name='links_linke_hash_li_594ad2_idx'),
21 | ),
22 | ]
23 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0010_data_link_event_id_hash.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.14 on 2023-02-15 17:15
2 | import hashlib
3 | from django.db import migrations
4 |
5 |
6 | def add_hash_link_event_id(apps, schema_editor):
7 | LinkEvent = apps.get_model("links", "LinkEvent")
8 | unhashed = LinkEvent.objects.filter(hash_link_event_id__exact='')
9 | for i in range(100):
10 | if unhashed.count() == 0:
11 | break
12 | else:
13 | for event in unhashed.all()[:100000]:
14 | link_event_id = event.link + event.event_id
15 | hash = hashlib.sha256()
16 | hash.update(link_event_id.encode("utf-8"))
17 | event.hash_link_event_id = hash.hexdigest()
18 | event.save(update_fields=(['hash_link_event_id']))
19 |
20 |
21 | class Migration(migrations.Migration):
22 |
23 | dependencies = [
24 | ("links", "0009_auto_20230215_1656"),
25 | ]
26 |
27 | operations = [migrations.RunPython(add_hash_link_event_id)]
28 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0011_auto_20230217_1326.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.14 on 2023-02-17 13:26
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('links', '0010_data_link_event_id_hash'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddIndex(
14 | model_name='linkevent',
15 | index=models.Index(fields=['timestamp'], name='links_linke_timesta_4a56de_idx'),
16 | ),
17 | ]
18 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0012_alter_linkevent_id_alter_linksearchtotal_id_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.7 on 2023-11-07 19:28
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('links', '0011_auto_20230217_1326'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='linkevent',
15 | name='id',
16 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
17 | ),
18 | migrations.AlterField(
19 | model_name='linksearchtotal',
20 | name='id',
21 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
22 | ),
23 | migrations.AlterField(
24 | model_name='urlpattern',
25 | name='id',
26 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
27 | ),
28 | ]
29 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0013_add_linkevent_url_linkevent_content_type_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.14 on 2024-09-27 17:43
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('contenttypes', '0002_remove_content_type_name'),
11 | ('organisations', '0008_alter_collection_id_alter_organisation_id_and_more'),
12 | ('links', '0012_alter_linkevent_id_alter_linksearchtotal_id_and_more'),
13 | ]
14 |
15 | operations = [
16 | migrations.AddField(
17 | model_name='linkevent',
18 | name='content_type',
19 | field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='content_type', to='contenttypes.contenttype'),
20 | ),
21 | migrations.AddField(
22 | model_name='linkevent',
23 | name='object_id',
24 | field=models.PositiveIntegerField(null=True),
25 | ),
26 | migrations.AddField(
27 | model_name='urlpattern',
28 | name='collections',
29 | field=models.ManyToManyField(related_name='urlpatterns', to='organisations.collection'),
30 | ),
31 | migrations.AddIndex(
32 | model_name='linkevent',
33 | index=models.Index(fields=['content_type', 'object_id'], name='links_linke_content_1a162a_idx'),
34 | ),
35 | ]
36 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/0014_migrate_url_pattern_relationships.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.14 on 2024-08-20 17:14
2 | from django.contrib.contenttypes.models import ContentType
3 | from django.core.paginator import Paginator
4 | from django.core.serializers import deserialize
5 | from django.db import migrations
6 | from extlinks.organisations.models import Collection
7 |
8 | import logging
9 |
10 | logger = logging.getLogger(__name__)
11 | handler = logging.StreamHandler()
12 | logger.addHandler(handler)
13 | logger.setLevel(logging.INFO)
14 |
15 | def import_url_collection_json():
16 | with open("./extlinks/links/migrations/urlpatterns.json", "r") as f:
17 | logger.info("\timporting urlpatterns ...")
18 | url_pattern_data = deserialize("json", f)
19 | for url_pattern in url_pattern_data:
20 | # ensuring the related collection exists before attempting to save url pattern
21 | if Collection.objects.filter(pk=url_pattern.object.collection_id).first():
22 | url_pattern.save()
23 |
24 |
25 | def process_link_event(link_event):
26 | #logger.info("\tprocessing linkevent: {}".format(link_event.id))
27 | urlpatterns = (
28 | link_event.url.all()
29 | .order_by("-url__length")
30 | )
31 |
32 | # Find opportunities for early exit
33 | if not urlpatterns:
34 | logger.info("\t\tlinkevent {}:\tno url patterns found!".format(link_event.id))
35 | return
36 | pattern_count = len(urlpatterns)
37 | if pattern_count == 1:
38 | link_event.object_id = urlpatterns[0].pk
39 | link_event.content_type_id = ContentType.objects.get(model="urlpattern").id
40 | link_event.content_object = urlpatterns[0]
41 | link_event.save()
42 | return
43 | elif pattern_count > 2:
44 | logger.info("\t\tlinkevent {}:\tmore than 2 url patterns found!".format(link_event.id))
45 | return
46 |
47 | # @FIXME: unreachable?
48 | # Save the longest (i.e. most specific) URL pattern in the link event
49 | # link_event.object_id = urlpatterns[0].pk
50 | # link_event.content_type_id = ContentType.objects.get(model="urlpattern").id
51 | # link_event.content_object = urlpatterns[0]
52 | # link_event.save()
53 |
54 | def migrate_relationships(apps, schema):
55 | logger.info("\n")
56 | import_url_collection_json()
57 | LinkEvent = apps.get_model("links", "LinkEvent")
58 | paginator = Paginator(
59 | LinkEvent.objects.filter(content_type__isnull=True).order_by("id"), 1000
60 | )
61 | last_page = paginator.page_range[-1]
62 | for page_num in paginator.page_range:
63 | logger.info("\tprocessing linkevent page: {page_num}/{last_page}".format(page_num=page_num,last_page=last_page))
64 | for link_event in paginator.page(page_num).object_list:
65 | process_link_event(link_event)
66 |
67 |
68 | class Migration(migrations.Migration):
69 | atomic = False
70 |
71 | dependencies = [
72 | ("links", "0013_add_linkevent_url_linkevent_content_type_and_more"),
73 | ]
74 |
75 | operations = [
76 | migrations.RunPython(migrate_relationships),
77 | ]
78 |
--------------------------------------------------------------------------------
/extlinks/links/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/links/migrations/__init__.py
--------------------------------------------------------------------------------
/extlinks/links/models.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import logging
3 | from datetime import date
4 |
5 | from django.contrib.contenttypes.fields import GenericRelation, GenericForeignKey
6 | from django.contrib.contenttypes.models import ContentType
7 | from django.core.cache import cache
8 | from django.db import models
9 | from django.db.models.signals import post_save
10 | from django.dispatch import receiver
11 | from django.utils.functional import cached_property
12 |
13 | logger = logging.getLogger("django")
14 |
15 |
16 |
17 | class URLPatternManager(models.Manager):
18 | models.CharField.register_lookup(models.functions.Length)
19 | def cached(self):
20 | cached_patterns = cache.get('url_pattern_cache')
21 | if not cached_patterns:
22 | cached_patterns = self.all()
23 | logger.info('set url_pattern_cache')
24 | cache.set('url_pattern_cache', cached_patterns, None)
25 | return cached_patterns
26 |
27 | def matches(self, link):
28 | # All URL patterns matching this link
29 | tracked_urls = self.cached()
30 | return [
31 | pattern
32 | for pattern in tracked_urls
33 | if pattern.url in link or pattern.get_proxied_url in link
34 | ]
35 |
36 | class URLPattern(models.Model):
37 | class Meta:
38 | app_label = "links"
39 | verbose_name = "URL pattern"
40 | verbose_name_plural = "URL patterns"
41 |
42 | objects = URLPatternManager()
43 | # This doesn't have to look like a 'real' URL so we'll use a CharField.
44 | url = models.CharField(max_length=150)
45 | link_events = GenericRelation("LinkEvent",
46 | null=True,
47 | blank=True,
48 | default=None,
49 | related_query_name="url_pattern",
50 | on_delete=models.SET_NULL)
51 | collection = models.ForeignKey(
52 | "organisations.Collection",
53 | null=True,
54 | on_delete=models.SET_NULL,
55 | related_name="url",
56 | )
57 | collections = models.ManyToManyField(
58 | "organisations.Collection", related_name="urlpatterns"
59 | )
60 |
61 | def __str__(self):
62 | return self.url
63 |
64 | @cached_property
65 | def get_proxied_url(self):
66 | # This isn't everything that happens, but it's good enough
67 | # for us to make a decision about whether we have a match.
68 | return self.url.replace(".", "-")
69 |
70 |
71 | @receiver(post_save, sender=URLPattern)
72 | def delete_url_pattern_cache(sender, instance, **kwargs):
73 | if cache.delete("url_pattern_cache"):
74 | logger.info("delete url_pattern_cache")
75 |
76 |
77 | class LinkSearchTotal(models.Model):
78 | class Meta:
79 | app_label = "links"
80 | verbose_name = "LinkSearch total"
81 | verbose_name_plural = "LinkSearch totals"
82 | # We only want one record for each URL on any particular date
83 | constraints = [
84 | models.UniqueConstraint(fields=["url", "date"], name="unique_date_total")
85 | ]
86 |
87 | url = models.ForeignKey(URLPattern, null=True, on_delete=models.SET_NULL)
88 |
89 | date = models.DateField(default=date.today)
90 | total = models.PositiveIntegerField()
91 |
92 |
93 | class LinkEvent(models.Model):
94 | """
95 | Stores data from the page-links-change EventStream
96 |
97 | https://stream.wikimedia.org/?doc#!/Streams/get_v2_stream_page_links_change
98 | """
99 |
100 | class Meta:
101 | app_label = "links"
102 | get_latest_by = "timestamp"
103 | indexes = [
104 | models.Index(
105 | fields=[
106 | "hash_link_event_id",
107 | ]
108 | ),
109 | models.Index(
110 | fields=[
111 | "timestamp",
112 | ]
113 | ),
114 | models.Index(fields=["content_type", "object_id"]),
115 | ]
116 | url = models.ManyToManyField(URLPattern, related_name="linkevent")
117 | # URLs should have a max length of 2083
118 | link = models.CharField(max_length=2083)
119 | timestamp = models.DateTimeField()
120 | domain = models.CharField(max_length=32, db_index=True)
121 | content_type = models.ForeignKey(ContentType, on_delete=models.SET_NULL, related_name="content_type", null=True)
122 | object_id = models.PositiveIntegerField(null=True)
123 | content_object = GenericForeignKey("content_type", "object_id")
124 |
125 | username = models.ForeignKey(
126 | "organisations.User",
127 | null=True,
128 | on_delete=models.SET_NULL,
129 | )
130 | # rev_id has null=True because some tracked revisions don't have a
131 | # revision ID, like page moves.
132 | rev_id = models.PositiveIntegerField(null=True)
133 | # IPs have no user_id, so this can be blank too.
134 | user_id = models.PositiveIntegerField(null=True)
135 | page_title = models.CharField(max_length=255)
136 | page_namespace = models.IntegerField()
137 | event_id = models.CharField(max_length=36)
138 | user_is_bot = models.BooleanField(default=False)
139 | hash_link_event_id = models.CharField(max_length=256, blank=True)
140 |
141 | # Were links added or removed?
142 | REMOVED = 0
143 | ADDED = 1
144 |
145 | CHANGE_CHOICES = (
146 | (REMOVED, "Removed"),
147 | (ADDED, "Added"),
148 | )
149 |
150 | change = models.IntegerField(choices=CHANGE_CHOICES, db_index=True)
151 |
152 | # Flags whether this event was from a user on the user list for the
153 | # organisation tracking its URL.
154 | on_user_list = models.BooleanField(default=False)
155 |
156 | @property
157 | def get_organisation(self):
158 | url_pattern = URLPattern.objects.all()
159 | for url_pattern in url_pattern:
160 | link_events = url_pattern.link_events.all()
161 | if self in link_events:
162 | return url_pattern.collection.organisation
163 |
164 | def save(self, **kwargs):
165 | link_event_id = self.link + self.event_id
166 | hash = hashlib.sha256()
167 | hash.update(link_event_id.encode("utf-8"))
168 | self.hash_link_event_id = hash.hexdigest()
169 | super().save(**kwargs)
170 |
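A small sketch of how URLPatternManager.matches behaves, assuming a pattern for "example.com" exists (the URLs below are invented):

    # Hypothetical check: matches() returns every cached pattern whose url,
    # or its dot-to-dash "proxied" form, appears as a substring of the link.
    from extlinks.links.models import URLPattern

    pattern = URLPattern.objects.create(url="example.com")
    URLPattern.objects.matches("https://example.com/article/1")       # [pattern]
    URLPattern.objects.matches("https://example-com.someproxy.org/x")  # [pattern], via get_proxied_url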
--------------------------------------------------------------------------------
/extlinks/logs/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
--------------------------------------------------------------------------------
/extlinks/organisations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/organisations/__init__.py
--------------------------------------------------------------------------------
/extlinks/organisations/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | from .models import Organisation, Collection, User
4 |
5 |
6 | class UserAdmin(admin.ModelAdmin):
7 | list_display = ("username",)
8 |
9 |
10 | admin.site.register(User, UserAdmin)
11 |
12 |
13 | class OrganisationAdmin(admin.ModelAdmin):
14 | list_display = ("name",)
15 | list_filter = ("name",)
16 | exclude = ("username_list",)
17 |
18 |
19 | admin.site.register(Organisation, OrganisationAdmin)
20 |
21 |
22 | class CollectionAdmin(admin.ModelAdmin):
23 | list_display = ("name", "organisation")
24 | list_filter = ("name", "organisation")
25 | list_select_related = ["organisation"]
26 | search_fields = ["name"]
27 |
28 |
29 | admin.site.register(Collection, CollectionAdmin)
30 |
--------------------------------------------------------------------------------
/extlinks/organisations/factories.py:
--------------------------------------------------------------------------------
1 | import factory
2 |
3 | from .models import User, Organisation, Collection
4 |
5 |
6 | class UserFactory(factory.django.DjangoModelFactory):
7 |
8 | class Meta:
9 | model = User
10 | strategy = factory.CREATE_STRATEGY
11 |
12 | username = factory.Faker('name')
13 |
14 |
15 | class OrganisationFactory(factory.django.DjangoModelFactory):
16 |
17 | class Meta:
18 | model = Organisation
19 | strategy = factory.CREATE_STRATEGY
20 |
21 | name = factory.Faker('company')
22 |
23 | @factory.post_generation
24 | def program(self, create, extracted, **kwargs):
25 | if not create:
26 | return
27 | if extracted:
28 | for program in extracted:
29 | self.program.add(program)
30 |
31 |
32 | class CollectionFactory(factory.django.DjangoModelFactory):
33 |
34 | class Meta:
35 | model = Collection
36 | strategy = factory.CREATE_STRATEGY
37 |
38 | name = factory.Faker('word')
39 | organisation = factory.SubFactory(OrganisationFactory)
40 |
--------------------------------------------------------------------------------
/extlinks/organisations/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/organisations/management/__init__.py
--------------------------------------------------------------------------------
/extlinks/organisations/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/organisations/management/commands/__init__.py
--------------------------------------------------------------------------------
/extlinks/organisations/management/commands/users_update_lists.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 |
4 | from extlinks.common.management.commands import BaseCommand
5 | from django.db import close_old_connections
6 | from django.utils.timezone import now
7 |
8 | from extlinks.organisations.models import Organisation, User
9 |
10 |
11 | class Command(BaseCommand):
12 | help = "Updates the user list of organisations that have a username_list_url"
13 |
14 | def _handle(self, *args, **options):
15 | user_list_orgs = Organisation.objects.filter(username_list_url__isnull=False)
16 |
17 | for organisation in user_list_orgs:
18 | username_list_url = organisation.username_list_url
19 |
20 | # TODO: Hacky way to get TWL working, needs to be flexible.
21 | auth_key = os.environ["TWL_API_TOKEN"]
22 | response = requests.get(
23 | username_list_url,
24 | headers={"Authorization": "Token {}".format(auth_key)},
25 | )
26 | if response.status_code == 200:
27 | json_response = response.json()
28 | else:
29 | continue
30 |
31 | # If we got a valid response, clear the previous username list
32 | organisation.username_list.clear()
33 |
34 | for result in json_response:
35 | username = result["wp_username"]
36 |
37 | user_object, _ = User.objects.get_or_create(username=username)
38 |
39 | organisation.username_list.add(user_object)
40 | # Useful for health check
41 | organisation.username_list_updated = now()
42 | organisation.save()
43 |
44 | close_old_connections()
45 |
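The user list endpoint is expected to return a JSON array of objects carrying a wp_username key; a minimal sketch of a payload this command can consume (usernames invented):

    # Hypothetical response body for organisation.username_list_url: the command
    # reads result["wp_username"] from each entry and adds it to username_list.
    json_response = [
        {"wp_username": "ExampleEditor"},
        {"wp_username": "AnotherEditor"},
    ]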
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-05-20 14:01
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | initial = True
10 |
11 | dependencies = [
12 | ("programs", "0001_initial"),
13 | ]
14 |
15 | operations = [
16 | migrations.CreateModel(
17 | name="Organisation",
18 | fields=[
19 | (
20 | "id",
21 | models.AutoField(
22 | auto_created=True,
23 | primary_key=True,
24 | serialize=False,
25 | verbose_name="ID",
26 | ),
27 | ),
28 | ("name", models.CharField(max_length=40)),
29 | ("limit_by_user", models.BooleanField(default=False)),
30 | ("username_list", models.TextField(blank=True, null=True)),
31 | ("username_list_url", models.URLField(blank=True, null=True)),
32 | (
33 | "program",
34 | models.ForeignKey(
35 | blank=True,
36 | null=True,
37 | on_delete=django.db.models.deletion.SET_NULL,
38 | to="programs.Program",
39 | ),
40 | ),
41 | ],
42 | ),
43 | migrations.CreateModel(
44 | name="Collection",
45 | fields=[
46 | (
47 | "id",
48 | models.AutoField(
49 | auto_created=True,
50 | primary_key=True,
51 | serialize=False,
52 | verbose_name="ID",
53 | ),
54 | ),
55 | ("name", models.CharField(max_length=40)),
56 | (
57 | "organisation",
58 | models.ForeignKey(
59 | null=True,
60 | on_delete=django.db.models.deletion.SET_NULL,
61 | to="organisations.Organisation",
62 | ),
63 | ),
64 | ],
65 | ),
66 | ]
67 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0002_auto_20190603_1255.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-03 12:55
2 |
3 | from django.db import migrations
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("organisations", "0001_initial"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterModelOptions(
14 | name="collection",
15 | options={"ordering": ["name"]},
16 | ),
17 | migrations.AlterModelOptions(
18 | name="organisation",
19 | options={"ordering": ["name"]},
20 | ),
21 | ]
22 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0003_auto_20190603_1325.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-03 13:25
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("programs", "0002_auto_20190603_1255"),
10 | ("organisations", "0002_auto_20190603_1255"),
11 | ]
12 |
13 | operations = [
14 | migrations.RemoveField(
15 | model_name="organisation",
16 | name="program",
17 | ),
18 | migrations.AddField(
19 | model_name="organisation",
20 | name="program",
21 | field=models.ManyToManyField(blank=True, null=True, to="programs.Program"),
22 | ),
23 | ]
24 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0004_auto_20190603_1325.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-03 13:25
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("organisations", "0003_auto_20190603_1325"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name="organisation",
15 | name="program",
16 | field=models.ManyToManyField(to="programs.Program"),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0005_auto_20190628_1221.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-28 12:21
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("organisations", "0004_auto_20190603_1325"),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name="User",
15 | fields=[
16 | (
17 | "id",
18 | models.AutoField(
19 | auto_created=True,
20 | primary_key=True,
21 | serialize=False,
22 | verbose_name="ID",
23 | ),
24 | ),
25 | ("username", models.CharField(max_length=235)),
26 | ],
27 | ),
28 | migrations.RemoveField(
29 | model_name="organisation",
30 | name="limit_by_user",
31 | ),
32 | migrations.RemoveField(
33 | model_name="organisation",
34 | name="username_list",
35 | ),
36 | migrations.AddField(
37 | model_name="organisation",
38 | name="username_list",
39 | field=models.ManyToManyField(to="organisations.User"),
40 | ),
41 | ]
42 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0006_auto_20190730_1355.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.3 on 2019-07-30 13:55
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("organisations", "0005_auto_20190628_1221"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name="organisation",
15 | name="program",
16 | field=models.ManyToManyField(blank=True, to="programs.Program"),
17 | ),
18 | migrations.AlterField(
19 | model_name="organisation",
20 | name="username_list",
21 | field=models.ManyToManyField(blank=True, to="organisations.User"),
22 | ),
23 | ]
24 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0007_auto_20230216_1931.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.14 on 2023-02-16 19:31
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('organisations', '0006_auto_20190730_1355'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='user',
15 | name='username',
16 | field=models.CharField(max_length=235, unique=True),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0008_alter_collection_id_alter_organisation_id_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.7 on 2023-11-07 19:28
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('organisations', '0007_auto_20230216_1931'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='collection',
15 | name='id',
16 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
17 | ),
18 | migrations.AlterField(
19 | model_name='organisation',
20 | name='id',
21 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
22 | ),
23 | migrations.AlterField(
24 | model_name='user',
25 | name='id',
26 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
27 | ),
28 | ]
29 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/0009_organisation_username_list_updated.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.20 on 2025-04-02 03:32
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("organisations", "0008_alter_collection_id_alter_organisation_id_and_more"),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name="organisation",
15 | name="username_list_updated",
16 | field=models.DateTimeField(auto_now=True),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/extlinks/organisations/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/organisations/migrations/__init__.py
--------------------------------------------------------------------------------
/extlinks/organisations/models.py:
--------------------------------------------------------------------------------
1 | from django.contrib.contenttypes.models import ContentType
2 | from django.db import models
3 |
4 | from extlinks.links.models import LinkEvent, URLPattern
5 |
6 |
7 | class User(models.Model):
8 | class Meta:
9 | app_label = "organisations"
10 |
11 | username = models.CharField(max_length=235, unique=True)
12 |
13 | def __str__(self):
14 | return self.username
15 |
16 |
17 | class Organisation(models.Model):
18 | class Meta:
19 | app_label = "organisations"
20 | ordering = ["name"]
21 |
22 | name = models.CharField(max_length=40)
23 |
24 | # programs.Program syntax required to avoid circular import.
25 | program = models.ManyToManyField("programs.Program", blank=True)
26 |
27 | username_list = models.ManyToManyField(User, blank=True)
28 | # If a URL is placed here, we'll use it to regularly update username_list
29 | username_list_url = models.URLField(blank=True, null=True)
30 | username_list_updated = models.DateTimeField(auto_now=True)
31 |
32 | def __str__(self):
33 | return self.name
34 |
35 | @property
36 | def limit_by_user(self):
37 | return self.username_list.exists()
38 |
39 |
40 | class Collection(models.Model):
41 | class Meta:
42 | app_label = "organisations"
43 | ordering = ["name"]
44 |
45 | name = models.CharField(max_length=40)
46 |
47 | organisation = models.ForeignKey(Organisation, null=True, on_delete=models.SET_NULL)
48 |
49 | def __str__(self):
50 | return self.name
51 |
52 | def get_linkevents(self):
53 | url_patterns = URLPattern.objects.filter(collections__name__contains=self.name)
54 | url_pattern_type = ContentType.objects.get_for_model(URLPattern)
55 | return LinkEvent.objects.filter(
56 | content_type__pk=url_pattern_type.id, object_id__in=url_patterns
57 | )
58 |
59 | def get_url_patterns(self):
60 | return URLPattern.objects.filter(collections__name__contains=self.name)
61 |
--------------------------------------------------------------------------------
/extlinks/organisations/templates/organisations/organisation_detail.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 |
18 |
19 |
20 |
21 | {{ object }}
22 |
23 |
62 |
63 | {% include "organisations/organisation_charts_include.html" %}
64 |
65 |
66 | {% endblock %}
67 |
--------------------------------------------------------------------------------
/extlinks/organisations/templates/organisations/organisation_list.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 | Organisations
6 |
7 | {% for organisation in object_list %}
8 |
9 |
10 | {{ organisation }}
11 | {% if organisation.collection_count > 1 %}
12 | Collections: {{ organisation.collection_count }}
13 | {% endif %}
14 | Overview
15 |
16 |
17 | {% empty %}
18 | No organisations.
19 | {% endfor %}
20 |
21 |
22 | {% endblock %}
23 |
--------------------------------------------------------------------------------
/extlinks/organisations/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path
2 |
3 | from extlinks.common.views import CSVPageTotals
4 | from extlinks.common.urls import urlpatterns as shared_urls
5 | from .views import (
6 | OrganisationDetailView,
7 | OrganisationListView,
8 | get_editor_count,
9 | get_project_count,
10 | get_links_count,
11 | get_top_pages,
12 | get_top_projects,
13 | get_top_users,
14 | get_latest_link_events,
15 | )
16 |
17 | urlpatterns = [
18 | path("", OrganisationListView.as_view(), name="list"),
19 | path("", OrganisationDetailView.as_view(), name="detail"),
20 | path("editor_count/", get_editor_count, name="editor_count"),
21 | path("project_count/", get_project_count, name="project_count"),
22 | path("links_count/", get_links_count, name="links_count"),
23 | path("top_pages/", get_top_pages, name="top_pages"),
24 | path("top_projects/", get_top_projects, name="top_projects"),
25 | path("top_users/", get_top_users, name="top_users"),
26 | path("latest_link_events/", get_latest_link_events, name="latest_link_events"),
27 | # CSV downloads
28 | path("/csv/page_totals", CSVPageTotals.as_view(), name="csv_page_totals"),
29 | ]
30 |
31 | urlpatterns += shared_urls
32 |
--------------------------------------------------------------------------------
/extlinks/programs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/programs/__init__.py
--------------------------------------------------------------------------------
/extlinks/programs/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | from .models import Program
4 |
5 |
6 | class ProgramAdmin(admin.ModelAdmin):
7 | list_display = ("name",)
8 |
9 |
10 | admin.site.register(Program, ProgramAdmin)
11 |
--------------------------------------------------------------------------------
/extlinks/programs/factories.py:
--------------------------------------------------------------------------------
1 | import factory
2 |
3 | from .models import Program
4 |
5 |
6 | class ProgramFactory(factory.django.DjangoModelFactory):
7 | class Meta:
8 | model = Program
9 | strategy = factory.CREATE_STRATEGY
10 |
11 | name = factory.Faker("company")
12 | description = factory.Faker("text", max_nb_chars=200)
13 |
--------------------------------------------------------------------------------
/extlinks/programs/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/programs/management/__init__.py
--------------------------------------------------------------------------------
/extlinks/programs/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/programs/management/commands/__init__.py
--------------------------------------------------------------------------------
/extlinks/programs/management/commands/programs_example_data.py:
--------------------------------------------------------------------------------
1 | import random
2 | from faker import Faker
3 |
4 | from extlinks.common.management.commands import BaseCommand
5 |
6 | from extlinks.links.models import URLPattern
7 | from extlinks.organisations.models import Organisation, Collection, User
8 | from extlinks.programs.models import Program
9 |
10 |
11 | class Command(BaseCommand):
12 | help = "Creates a range of test programs, organisations, and collections"
13 |
14 | def add_arguments(self, parser):
15 | parser.add_argument("num", nargs="+", type=int)
16 |
17 | def _handle(self, *args, **options):
18 | num_programs = options["num"][0]
19 |
20 | fake = Faker()
21 |
22 | for i in range(num_programs):
23 | new_program = Program(
24 | name="Program {num}".format(num=i),
25 | description=fake.text(max_nb_chars=200),
26 | )
27 | new_program.save()
28 |
29 | for j in range(random.randint(1, 20)):
30 | # Will this org limit by user?
31 | limit_by_user = random.choice([True, False])
32 |
33 | new_org = Organisation(name=fake.company())
34 | new_org.save()
35 | if limit_by_user:
36 | # Between 10 and 50 users on the list.
37 | username_list = [
38 | fake.user_name() for _ in range(random.randint(10, 50))
39 | ]
40 | for username in username_list:
41 | user, _ = User.objects.get_or_create(username=username)
42 | new_org.username_list.add(user)
43 | new_org.program.add(new_program)
44 |
45 | for k in range(random.randint(1, 3)):
46 | new_collection = Collection(
47 | name=fake.sentence(nb_words=3)[:-1], organisation=new_org
48 | )
49 | new_collection.save()
50 |
51 | for l in range(random.randint(1, 2)):
52 | new_urlpattern = URLPattern(
53 | # Strip https:// and /
54 | url=fake.url(schemes=["https"])[8:-1],
55 | )
56 | new_urlpattern.save()
57 | new_urlpattern.collections.add(new_collection)
58 | new_urlpattern.save()
59 |
--------------------------------------------------------------------------------
/extlinks/programs/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-05-20 14:01
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | initial = True
9 |
10 | dependencies = []
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name="Program",
15 | fields=[
16 | (
17 | "id",
18 | models.AutoField(
19 | auto_created=True,
20 | primary_key=True,
21 | serialize=False,
22 | verbose_name="ID",
23 | ),
24 | ),
25 | ("name", models.CharField(max_length=40)),
26 | ("description", models.TextField(blank=True, null=True)),
27 | ],
28 | ),
29 | ]
30 |
--------------------------------------------------------------------------------
/extlinks/programs/migrations/0002_auto_20190603_1255.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-06-03 12:55
2 |
3 | from django.db import migrations
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ("programs", "0001_initial"),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterModelOptions(
14 | name="program",
15 | options={"ordering": ["name"]},
16 | ),
17 | ]
18 |
--------------------------------------------------------------------------------
/extlinks/programs/migrations/0003_alter_program_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.7 on 2023-11-07 19:28
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('programs', '0002_auto_20190603_1255'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='program',
15 | name='id',
16 | field=models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/extlinks/programs/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/extlinks/programs/migrations/__init__.py
--------------------------------------------------------------------------------
/extlinks/programs/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | from extlinks.links.models import LinkEvent
4 | from extlinks.organisations.models import Organisation
5 |
6 |
7 | class Program(models.Model):
8 | class Meta:
9 | app_label = "programs"
10 | ordering = ["name"]
11 |
12 | name = models.CharField(max_length=40)
13 |
14 | description = models.TextField(blank=True, null=True)
15 |
16 | def __str__(self):
17 | return self.name
18 |
19 | def get_linkevents(self):
20 | return LinkEvent.objects.filter(
21 | urlpattern__collection__organisation__program=self
22 | ).distinct()
23 |
24 | @property
25 | def any_orgs_user_list(self):
26 | """
27 | Returns True if any of this program's organisations limit by user
28 | """
29 | return Organisation.objects.filter(
30 | program=self, username_list__isnull=False
31 | ).exists()
32 |
--------------------------------------------------------------------------------
/extlinks/programs/templates/programs/program_detail.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 |
15 |
16 |
17 |
18 | {{ object }}
19 | Organisations: {{ program.organisation_set.count }}
20 |
21 |
60 |
61 | {% include "programs/program_charts_include.html" %}
62 |
63 |
64 | {% endblock %}
65 |
--------------------------------------------------------------------------------
/extlinks/programs/templates/programs/program_list.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 |
Programs
6 |
7 | {% for program in object_list %}
8 |
9 |
10 |
13 |
Organisations: {{ program.organisation_count }}
14 |
{{ program.description }}
15 |
Overview
16 |
17 |
18 | {% empty %}
19 | No programs.
20 | {% endfor %}
21 |
22 |
23 | {% endblock %}
24 |
--------------------------------------------------------------------------------
/extlinks/programs/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path
2 |
3 | from extlinks.common.views import CSVOrgTotals
4 | from extlinks.common.urls import urlpatterns as shared_urls
5 | from .views import (
6 | ProgramListView,
7 | ProgramDetailView,
8 | get_editor_count,
9 | get_project_count,
10 | get_links_count,
11 | get_top_organisations,
12 | get_top_projects,
13 | get_top_users,
14 | )
15 |
16 | urlpatterns = [
17 | path("", ProgramListView.as_view(), name="list"),
18 | path("<int:pk>", ProgramDetailView.as_view(), name="detail"),
19 | path("editor_count/", get_editor_count, name="editor_count"),
20 | path("project_count/", get_project_count, name="project_count"),
21 | path("links_count/", get_links_count, name="links_count"),
22 | path("top_organisations/", get_top_organisations, name="top_organisations"),
23 | path("top_projects/", get_top_projects, name="top_projects"),
24 | path("top_users/", get_top_users, name="top_users"),
25 | # CSV downloads
26 | path("<int:pk>/csv/org_totals", CSVOrgTotals.as_view(), name="csv_org_totals"),
27 | ]
28 |
29 | urlpatterns += shared_urls
30 |
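Because extlinks/urls.py includes these routes under the "programs" namespace, they are reversed with namespaced names. A small sketch, assuming the detail route takes a primary key as reconstructed above (the pk value is illustrative):

    from django.urls import reverse

    reverse("programs:list")                      # -> "/programs/"
    reverse("programs:detail", kwargs={"pk": 1})  # -> "/programs/1"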
--------------------------------------------------------------------------------
/extlinks/settings/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for extlinks project.
3 | """
4 |
5 | import os
6 | from pathlib import Path
7 |
8 |
9 | SECRET_KEY = os.environ["SECRET_KEY"]
10 | # Usually we'd define this relative to the settings file, but we're always
11 | # starting from /app in Docker.
12 | BASE_DIR = "/app"
13 |
14 | ALLOWED_HOSTS = ["127.0.0.1", "localhost", "0.0.0.0"]
15 |
16 | # Application definition
17 |
18 | INSTALLED_APPS = [
19 | "django.contrib.admin",
20 | "django.contrib.auth",
21 | "django.contrib.contenttypes",
22 | "django.contrib.sessions",
23 | "django.contrib.messages",
24 | "django.contrib.staticfiles",
25 | "extlinks.common",
26 | "extlinks.healthcheck",
27 | "extlinks.links",
28 | "extlinks.organisations",
29 | "extlinks.programs",
30 | "extlinks.aggregates",
31 | "django_extensions",
32 | ]
33 |
34 | MIDDLEWARE = [
35 | "django.middleware.security.SecurityMiddleware",
36 | "django.contrib.sessions.middleware.SessionMiddleware",
37 | "django.middleware.common.CommonMiddleware",
38 | "django.middleware.csrf.CsrfViewMiddleware",
39 | "django.contrib.auth.middleware.AuthenticationMiddleware",
40 | "django.contrib.messages.middleware.MessageMiddleware",
41 | "django.middleware.clickjacking.XFrameOptionsMiddleware",
42 | ]
43 |
44 | ROOT_URLCONF = "extlinks.urls"
45 |
46 | TEMPLATES = [
47 | {
48 | "BACKEND": "django.template.backends.django.DjangoTemplates",
49 | "DIRS": [os.path.join(BASE_DIR, "extlinks", "templates")],
50 | "APP_DIRS": True,
51 | "OPTIONS": {
52 | "context_processors": [
53 | "django.template.context_processors.debug",
54 | "django.template.context_processors.request",
55 | "django.contrib.auth.context_processors.auth",
56 | "django.contrib.messages.context_processors.messages",
57 | ],
58 | },
59 | },
60 | ]
61 |
62 | WSGI_APPLICATION = "extlinks.wsgi.application"
63 |
64 | # Database
65 | # https://docs.djangoproject.com/en/4.2/ref/settings/#databases
66 |
67 | DATABASES = {
68 | "default": {
69 | "ENGINE": "django.db.backends.mysql",
70 | "NAME": os.environ["MYSQL_DATABASE"],
71 | "USER": "root",
72 | "PASSWORD": os.environ["MYSQL_ROOT_PASSWORD"],
73 | "HOST": "db",
74 | "PORT": "3306",
75 | "OPTIONS": {"charset": "utf8mb4"},
76 | "CONN_MAX_AGE": None,
77 | "CONN_HEALTH_CHECKS": True,
78 | }
79 | }
80 |
81 | # Password validation
82 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
83 |
84 | AUTH_PASSWORD_VALIDATORS = [
85 | {
86 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
87 | },
88 | {
89 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
90 | },
91 | {
92 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
93 | },
94 | {
95 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
96 | },
106 | ]
107 |
108 | # Internationalization
109 | # https://docs.djangoproject.com/en/4.2/topics/i18n/
110 |
111 | LANGUAGE_CODE = "en-us"
112 |
113 | TIME_ZONE = "UTC"
114 |
115 | USE_I18N = True
116 |
117 | USE_L10N = True
118 |
119 | USE_TZ = True
120 |
121 | # Cache
122 |
123 | CACHES = {
124 | "default": {
125 | "BACKEND": "django.core.cache.backends.memcached.PyMemcacheCache",
126 | "LOCATION": "cache:11211",
127 | "TIMEOUT": 600,
128 | "OPTIONS": {
129 | "no_delay": True,
130 | "ignore_exc": True,
131 | "max_pool_size": 4,
132 | "use_pooling": True,
133 | },
134 | }
135 | }
136 |
137 | # Static files (CSS, JavaScript, Images)
138 | # https://docs.djangoproject.com/en/4.2/howto/static-files/
139 |
140 | STATIC_URL = "/static/"
141 | STATIC_ROOT = os.path.join(BASE_DIR, "static")
142 |
143 | # EMAIL CONFIGURATION
144 | # ------------------------------------------------------------------------------
145 | EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend"
146 | EMAIL_HOST = os.environ.get("DJANGO_EMAIL_HOST", "localhost")
147 | EMAIL_PORT = 25
148 | EMAIL_HOST_USER = ""
149 | EMAIL_HOST_PASSWORD = ""
150 | EMAIL_USE_TLS = False
151 |
152 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
153 |
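A note on the cache block above: it points Django's cache framework at the memcached container (host "cache") through pymemcache, with a ten-minute default timeout and a small connection pool. A minimal sketch of how application code would use it (the key and value are illustrative):

    from django.core.cache import cache

    cache.set("program_1_links_count", 12345, timeout=600)  # expires after ten minutes
    cache.get("program_1_links_count")                       # -> 12345, or None once expired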
--------------------------------------------------------------------------------
/extlinks/settings/helpers.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Any
3 |
4 |
5 | def sentry_before_send(event: dict, hint: dict):
6 | """
7 | Callback for sentry's client-side event filtering.
8 | We're using it to mask sensitive data.
9 | https://docs.sentry.io/platforms/python/configuration/filtering/#filtering-error-events
10 | Parameters
11 | ----------
12 | event : dict
13 | Sentry event dictionary object
14 | hint : dict
15 | Source data dictionary used to create the event.
16 | https://docs.sentry.io/platforms/python/configuration/filtering/#using-hints
17 | Returns
18 | -------
19 | dict
20 | The modified event.
21 | """
22 | # We catch any exception, because if we don't, the event is dropped.
23 | # We want to keep passing them on so we can continually improve our scrubbing
24 | # while still sending events.
25 | # noinspection PyBroadException
26 | try:
27 | event = _scrub_event(event)
28 | except Exception:
29 | pass
30 |
31 | return event
32 |
33 |
34 | def _mask_pattern(dirty: str):
35 | """
36 | Masks out known sensitive data from string.
37 | Parameters
38 | ----------
39 | dirty : str
40 | Input that may contain sensitive information.
41 | Returns
42 | -------
43 | str
44 | Output with any known sensitive information masked out.
45 | """
46 | # DB credentials as found in called processes.
47 | call_proc_db_creds = re.compile(r"--(user|password)=[^', ]+([', ])")
48 | clean = call_proc_db_creds.sub(r"--\1=*****\2", dirty)
49 |
50 | return clean
51 |
52 |
53 | def _scrub_event(event_data: Any):
54 | """
55 | Recursively traverses sentry event data and returns a scrubbed version.
56 | Parameters
57 | ----------
58 | event_data : Any
59 | Input that may contain sensitive information.
60 | Returns
61 | -------
62 | Any
63 | Output with any known sensitive information masked out.
64 | """
65 | # Basically cribbed from stackoverflow:
66 | # https://stackoverflow.com/a/38970181
67 | # Get dictionary items
68 | if isinstance(event_data, dict):
69 | items = event_data.items()
70 | # Enumerate list/tuple items
71 | elif isinstance(event_data, (list, tuple)):
72 | items = enumerate(event_data)
73 | # Mask sensitive patterns from stringlike elements
74 | else:
75 | return _mask_pattern(str(event_data))
76 |
77 | for key, value in items:
78 | # When we can id sensitive data by the key, do a simple replacement.
79 | if key == "user" or key == "password" or key == "passwd":
80 | event_data[key] = "*****"
81 | # Otherwise, continue recursion.
82 | else:
83 | event_data[key] = _scrub_event(value)
84 |
85 | return event_data
86 |
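To make the scrubbing concrete, here is a short sketch of what _mask_pattern does to a subprocess-style argument string; the command and credentials below are purely illustrative:

    from extlinks.settings.helpers import _mask_pattern

    dirty = "['mysqldump', '--user=root', '--password=s3cret', 'extlinks_db']"
    print(_mask_pattern(dirty))
    # ['mysqldump', '--user=*****', '--password=*****', 'extlinks_db']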
--------------------------------------------------------------------------------
/extlinks/settings/local.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | from .logging import *
3 | from os import getenv
4 | import sys
5 |
6 | DEBUG = True
7 |
8 | SERVER_EMAIL = "Wikilink Local "
9 | DEFAULT_FROM_EMAIL = SERVER_EMAIL
10 |
11 |
12 | # Django Debug Toolbar config
13 | # ------------------------------------------------------------------------------
14 |
15 | # Sometimes, developers do not want the debug toolbar in their local environments,
16 | # so we can disable it by not passing a REQUIREMENTS_FILE variable when building
17 | # the docker containers
18 | reqs = getenv("REQUIREMENTS_FILE", "django.txt")
19 | if reqs == "local.txt":
20 | TESTING = "test" in sys.argv
21 | if not TESTING:
22 | INSTALLED_APPS += [
23 | "debug_toolbar",
24 | ]
25 |
26 | MIDDLEWARE += [
27 | "debug_toolbar.middleware.DebugToolbarMiddleware",
28 | ]
29 |
30 | INTERNAL_IPS = ["127.0.0.1", "localhost", "0.0.0.0"]
31 |
32 | def show_toolbar(request):
33 | return True
34 |
35 | DEBUG_TOOLBAR_CONFIG = {
36 | "SHOW_TOOLBAR_CALLBACK": show_toolbar,
37 | }
38 | # Dummy Cache
39 | CACHES = {
40 | "default": {
41 | "BACKEND": "django.core.cache.backends.dummy.DummyCache",
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/extlinks/settings/logging.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging.config
3 |
4 | # LOGGING CONFIGURATION
5 | # ------------------------------------------------------------------------------
6 | # We're replacing the default logging config to get better control of the
7 | # mail_admins behavior.
8 | # Logging is in another file since Django 3.1 because of https://code.djangoproject.com/ticket/32016
9 |
10 | LOGGING_CONFIG = None
11 |
12 | logging.config.dictConfig(
13 | {
14 | "version": 1,
15 | "disable_existing_loggers": False,
16 | "filters": {
17 | "require_debug_false": {"()": "django.utils.log.RequireDebugFalse"},
18 | "require_debug_true": {"()": "django.utils.log.RequireDebugTrue"},
19 | },
20 | "formatters": {
21 | "django.server": {
22 | "()": "django.utils.log.ServerFormatter",
23 | "format": "[%(server_time)s] %(message)s",
24 | }
25 | },
26 | "handlers": {
27 | "nodebug_console": {
28 | "level": "WARNING",
29 | "filters": ["require_debug_false"],
30 | "class": "logging.StreamHandler",
31 | },
32 | "debug_console": {
33 | "level": "INFO",
34 | "filters": ["require_debug_true"],
35 | "class": "logging.StreamHandler",
36 | },
37 | "django.server": {
38 | "level": "INFO",
39 | "class": "logging.StreamHandler",
40 | "formatter": "django.server",
41 | },
42 | },
43 | "loggers": {
44 | "django": {
45 | "handlers": ["nodebug_console", "debug_console"],
46 | "level": os.environ.get("DJANGO_LOG_LEVEL", "INFO"),
47 | },
48 | "django.server": {
49 | "handlers": ["django.server"],
50 | "level": os.environ.get("DJANGO_LOG_LEVEL", "INFO"),
51 | "propagate": False,
52 | },
53 | "Wikilink": {
54 | "handlers": ["nodebug_console", "debug_console"],
55 | "level": os.environ.get("DJANGO_LOG_LEVEL", "INFO"),
56 | },
57 | },
58 | }
59 | )
60 |
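Application code logs through the named loggers configured above; the project-level "Wikilink" logger, for example, writes to the console handlers at the level given by DJANGO_LOG_LEVEL. A minimal sketch (the message text is illustrative):

    import logging

    logger = logging.getLogger("Wikilink")
    logger.info("users_update_lists finished")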
--------------------------------------------------------------------------------
/extlinks/settings/production.py:
--------------------------------------------------------------------------------
1 | import sentry_sdk
2 | from sentry_sdk.integrations.django import DjangoIntegration
3 | from extlinks.settings.helpers import sentry_before_send
4 |
5 | from .base import *
6 | from .logging import *
7 |
8 | DEBUG = False
9 |
10 | ALLOWED_HOSTS = ["wikilink.wmflabs.org"]
11 |
12 | # Redirect HTTP to HTTPS
13 | # SECURE_PROXY_SSL_HEADER is required because we're behind a proxy
14 | SECURE_SSL_REDIRECT = True
15 | SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
16 |
17 | DEFAULT_FROM_EMAIL = "Wikilink Production "
18 |
19 | sentry_sdk.init(
20 | dsn="https://cdabef0803434e3c97cb2c15f9a7da37@glitchtip-wikilink.wmflabs.org/1",
21 | integrations=[DjangoIntegration()],
22 | before_send=sentry_before_send,
23 | )
24 |
--------------------------------------------------------------------------------
/extlinks/templates/base.html:
--------------------------------------------------------------------------------
1 | {% load static %}
2 |
3 |
4 |
5 |
6 |
7 | Wikimedia External Links Tool
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | Wikilink
19 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 | {% block content %}
32 | {% endblock %}
33 |
34 |
35 |
36 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/extlinks/templates/documentation.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 |
Wikilink
6 |
7 | The Wikilink tool helps program organisers and organisations track external links on Wikimedia projects. While
8 | MediaWiki has the ability to search existing
9 | links, at the time of writing there is no way to easily monitor link additions and removals over time. The
10 | tool was built primarily for The Wikipedia Library's use case. Publishers donate access to Wikipedia editors,
11 | and while it was possible to monitor the total number of links over time, there was no simple way to investigate
12 | that data further - to find out where links were being added, who was adding them, or in the case of a drop
13 | in link numbers, why those links were removed.
14 |
15 |
16 |
Using the tool
17 |
18 | There are two primary views into the data - the 'program' level and 'organisation' level.
19 |
20 |
Programs
21 |
22 | Programs are collections of organisations. Program pages provide a high-level overview of the link additions
23 | and removals for many organisations in one place. If you have partnerships with multiple organisations,
24 | the program pages present their data in aggregate for reporting purposes.
25 |
26 |
Organisations
27 |
28 | Organisation pages provide data relevant to an individual organisation. Organisations can have multiple
29 | collections of tracked URLs - these could be different websites or simply different URL patterns. Results
30 | for each collection are presented individually. Additionally, each collection can have multiple URLs. This is
31 | primarily useful when a website has moved; both URLs can continue to be tracked in the same place.
32 |
33 |
34 |
Data collection
35 |
36 | Two sets of data are collected: link events and link totals.
37 |
38 |
Link events
39 |
40 | A
41 | script is always monitoring the
42 | page-links-change
43 | event stream; when a link tracked by Wikilink is added or removed, the data is stored in Wikilink's database.
44 |
45 |
46 | The event stream reports link additions and removals from all Wikimedia projects and languages, and tracks
47 | events from all namespaces. If a link is changed, it will register both an addition (the new URL) and a removal
48 | (the old URL). Editing the same URL multiple times in one edit will only send a single event.
49 |
50 |
51 | Please be aware there is currently a known bug with the
52 | event stream whereby some additional events are being sent related to template transclusions.
53 |
54 |
Link totals
55 |
56 | The tool also tracks the total number of links to each tracked URL on a weekly basis. These totals are
57 | retrieved from the externallinks table.
58 | Currently, these totals only consider Wikipedia projects, however they do cover every language. Unlike with the
59 | event stream, queries have to be made against each project's database individually, and it is therefore
60 | prohibitive to collect total data for every Wikimedia project.
61 |
62 |
63 | {% endblock %}
64 |
--------------------------------------------------------------------------------
/extlinks/templates/homepage.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 |
6 |
7 |
8 | Wikilink
9 |
10 |
11 |
12 |
13 |
14 | The Wikilink tool collates and presents data on the addition, removal, and total number of links to websites on
15 | Wikimedia projects. It tracks the
16 | page-links-change
17 | event stream for link additions and removals for specified URL Patterns, and queries the
18 | externallinks table to retrieve totals.
19 |
20 |
21 | The tool currently only supports The Wikipedia
22 | Library program; however, support for other programs and partnerships is planned.
23 |
24 |
25 |
26 | {% endblock %}
27 |
--------------------------------------------------------------------------------
/extlinks/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase, RequestFactory
2 | from django.urls import reverse
3 |
4 | from .views import Homepage
5 |
6 |
7 | class HomepageTest(TestCase):
8 | def test_homepage_view(self):
9 | """
10 | Can we simply load the homepage successfully?
11 | """
12 | factory = RequestFactory()
13 |
14 | request = factory.get(reverse("homepage"))
15 | response = Homepage.as_view()(request)
16 |
17 | self.assertEqual(response.status_code, 200)
18 |
--------------------------------------------------------------------------------
/extlinks/urls.py:
--------------------------------------------------------------------------------
1 | from os import getenv
2 | from django.contrib import admin
3 | from django.urls import include, path
4 | from django.conf import settings
5 |
6 | from extlinks.healthcheck.urls import urlpatterns as healthcheck_urls
7 | from extlinks.programs.urls import urlpatterns as programs_urls
8 | from extlinks.organisations.urls import urlpatterns as organisations_urls
9 |
10 | from .views import Homepage, Documentation
11 |
12 | urlpatterns = [
13 | path("admin/", admin.site.urls),
14 | path("", Homepage.as_view(), name="homepage"),
15 | path("docs", Documentation.as_view(), name="documentation"),
16 | path(
17 | "healthcheck/",
18 | include((healthcheck_urls, "healthcheck"), namespace="healthcheck"),
19 | ),
20 | path("programs/", include((programs_urls, "programs"), namespace="programs")),
21 | path(
22 | "organisations/",
23 | include((organisations_urls, "organisations"), namespace="organisations"),
24 | ),
25 | ]
26 |
27 | reqs = getenv("REQUIREMENTS_FILE", "django.txt")
28 | if settings.DEBUG and reqs == "local.txt":
29 | if not settings.TESTING:
30 | import debug_toolbar
31 |
32 | urlpatterns += [
33 | path("__debug__/", include(debug_toolbar.urls)),
34 | ]
35 |
--------------------------------------------------------------------------------
/extlinks/views.py:
--------------------------------------------------------------------------------
1 | from django.views.generic import TemplateView
2 |
3 |
4 | class Homepage(TemplateView):
5 | template_name = "homepage.html"
6 |
7 |
8 | class Documentation(TemplateView):
9 | template_name = "documentation.html"
10 |
--------------------------------------------------------------------------------
/extlinks/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for extlinks project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 | """
6 |
7 | import os
8 |
9 | from django.core.wsgi import get_wsgi_application
10 |
11 | application = get_wsgi_application()
12 |
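Note that this wsgi.py does not set DJANGO_SETTINGS_MODULE itself; the settings module comes from the environment (see DJANGO_SETTINGS_MODULE in template.env). If it ever had to stand alone, the conventional fallback would look like the sketch below, where the default value is an assumption:

    import os
    from django.core.wsgi import get_wsgi_application

    # Assumed fallback; in this project the variable is normally provided by the container environment.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "extlinks.settings.local")
    application = get_wsgi_application()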
--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Django's command-line utility for administrative tasks."""
3 | import os
4 | import sys
5 | from dotenv import load_dotenv
6 |
7 |
8 | def main():
9 | try:
10 | from django.core.management import execute_from_command_line
11 | except ImportError as exc:
12 | raise ImportError(
13 | "Couldn't import Django. Are you sure it's installed and "
14 | "available on your PYTHONPATH environment variable? Did you "
15 | "forget to activate a virtual environment?"
16 | ) from exc
17 | load_dotenv(".env")
18 | execute_from_command_line(sys.argv)
19 |
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
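main() loads .env before handing control to Django, so the variables that the settings read straight from os.environ (SECRET_KEY, MYSQL_DATABASE, MYSQL_ROOT_PASSWORD) must be present in that file. A minimal sketch of the same bootstrap outside manage.py, assuming a .env copied from template.env:

    import os
    from dotenv import load_dotenv

    load_dotenv(".env")                  # the same file manage.py loads
    print(os.environ["MYSQL_DATABASE"])  # -> "extlinks_db" with the template values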
--------------------------------------------------------------------------------
/nginx.conf:
--------------------------------------------------------------------------------
1 | map $http_x_forwarded_proto $web_proxy_scheme {
2 | default $scheme;
3 | https https;
4 | }
5 |
6 | map $http_user_agent $limit_bots {
7 | default "";
8 | ~*(GoogleBot|bingbot|YandexBot|mj12bot|Apache-HttpClient|Adsbot|Barkrowler|FacebookBot|dotbot|Googlebot|Bytespider|SemrushBot|AhrefsBot|Amazonbot|GPTBot|DotBot) $binary_remote_addr;
9 | }
10 |
11 | ## Testing the request method
12 | # Only GET and HEAD are caching safe.
13 | map $request_method $no_cache_method {
14 | default 1;
15 | HEAD 0;
16 | GET 0;
17 | }
18 |
19 | ## Testing for Cache-Control header
20 | # Only checking for no-cache because chrome annoyingly sets max-age=0 when hitting enter in the address bar.
21 | map $http_cache_control $no_cache_control {
22 | default 0;
23 | no-cache 1;
24 | }
25 |
26 | ## Testing for the session cookie being present
27 | map $http_cookie $no_cache_session {
28 | default 0;
29 | ~sessionid 1; # Django session cookie
30 | }
31 |
32 | ## proxy caching settings.
33 | proxy_cache_path /var/lib/nginx/cache levels=1:2 keys_zone=cache:8m max_size=10g inactive=10m;
34 | proxy_cache_key "$scheme$proxy_host$uri$is_args$args$http_accept_language";
35 | proxy_cache_lock on;
36 | proxy_cache_use_stale error timeout invalid_header updating http_500 http_502 http_503 http_504;
37 |
38 | # remote address is a joke here since we don't have x-forwarded-for
39 | limit_req_zone $limit_bots zone=bots:10m rate=1r/s;
40 | limit_req_zone $binary_remote_addr zone=one:10m rate=500r/s;
41 |
42 | upstream django_server {
43 | server externallinks:8000 fail_timeout=0;
44 | }
45 |
46 | server {
47 | listen 80 deferred;
48 | client_max_body_size 4G;
49 | server_name wikilink.wmflabs.org;
50 | keepalive_timeout 5;
51 |
52 | # Defined explicitly to avoid caching
53 | location /healthcheck/link_event {
54 | # Rate limit
55 | limit_req zone=bots burst=2 nodelay;
56 | limit_req zone=one burst=1000 nodelay;
57 | limit_req_status 429;
58 | # Proxy
59 | proxy_set_header X-Forwarded-Proto $web_proxy_scheme;
60 | proxy_set_header Host $http_host;
61 | proxy_redirect off;
62 | proxy_pass http://django_server;
63 | }
64 |
65 | location = /robots.txt {
66 | add_header Content-Type text/plain;
67 | alias /app/robots.txt;
68 | }
69 |
70 | location / {
71 | root /app/;
72 | expires 30d;
73 |
74 | if ($http_user_agent ~* (GoogleBot|bingbot|YandexBot|mj12bot|Apache-HttpClient|Adsbot|Barkrowler|FacebookBot|dotbot|Bytespider|SemrushBot|AhrefsBot|Amazonbot|GPTBot) ) {
75 | return 403;
76 | }
77 | location /admin/links/ {
78 | try_files $uri @django-admin-slow;
79 | }
80 | # checks for static file, if not found proxy to app
81 | try_files $uri @django;
82 | }
83 | location @django {
84 | # Cache
85 | proxy_cache_valid 200 301 302 401 403 404 1d;
86 | proxy_cache_bypass $http_pragma $no_cache_method $no_cache_control $no_cache_session;
87 | proxy_cache_revalidate on;
88 | proxy_cache cache;
89 | add_header X-Cache-Status $upstream_cache_status;
90 | # Rate limit
91 | limit_req zone=bots burst=2 nodelay;
92 | limit_req zone=one burst=1000 nodelay;
93 | limit_req_status 429;
94 | # Proxy
95 | proxy_set_header X-Forwarded-Proto $web_proxy_scheme;
96 | proxy_set_header Host $http_host;
97 | proxy_redirect off;
98 | proxy_pass http://django_server;
99 | }
100 | location @django-admin-slow {
101 | # https://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_send_timeout
102 | proxy_connect_timeout 120s;
103 | proxy_send_timeout 120s;
104 | proxy_read_timeout 120s;
105 | # https://nginx.org/en/docs/http/ngx_http_core_module.html#send_timeout
106 | send_timeout 120s;
107 | keepalive_timeout 120s;
108 | # Cache
109 | proxy_cache_valid 200 301 302 401 403 404 1d;
110 | proxy_cache_bypass $http_pragma $no_cache_method $no_cache_control $no_cache_session;
111 | proxy_cache_revalidate on;
112 | proxy_cache cache;
113 | add_header X-Cache-Status $upstream_cache_status;
114 | # Rate limit
115 | limit_req zone=bots burst=2 nodelay;
116 | limit_req zone=one burst=1000 nodelay;
117 | limit_req_status 429;
118 | # Proxy
119 | proxy_set_header X-Forwarded-Proto $web_proxy_scheme;
120 | proxy_set_header Host $http_host;
121 | proxy_redirect off;
122 | proxy_pass http://django_server;
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/requirements/django.txt:
--------------------------------------------------------------------------------
1 | Django>=4.2.7,<5.0.0
2 | django_extensions==3.2.3
3 | factory_boy>=3.0.1
4 | faker==26.0.0
5 | filelock==3.18.0
6 | mysqlclient==2.2.4
7 | pymemcache==4.0.0
8 | python-dotenv==1.1.0
9 | sentry-sdk==2.10.0
10 | sseclient==0.0.27
11 | time_machine==2.14.2
12 | python-swiftclient>=4.6.0,<5.0.0
13 | keystoneauth1>=5.9.2,<6.0.0
14 | coverage==7.8.0
15 |
--------------------------------------------------------------------------------
/requirements/local.txt:
--------------------------------------------------------------------------------
1 | -r django.txt
2 | django-debug-toolbar==4.4.6
3 | pudb==2024.1.3
4 |
--------------------------------------------------------------------------------
/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /
3 |
--------------------------------------------------------------------------------
/static/css/local.css:
--------------------------------------------------------------------------------
1 | h1 {
2 | margin-bottom: 20px;
3 | }
4 |
5 | hr {
6 | margin-top: 2%;
7 | width: 70%;
8 | }
9 |
10 | .body {
11 | padding-left: 10%;
12 | padding-right: 10%;
13 | padding-top: 2%;
14 | }
15 |
16 | .charts-body {
17 | padding: 2%;
18 | width: 100%;
19 |
20 | }
21 |
22 | .navbar {
23 | background-color: #fbfbfb;
24 | }
25 |
26 | .navbar-light .navbar-nav .nav-link {
27 | color: rgba(0, 0, 0, 0.7);
28 | }
29 |
30 | .intro {
31 | font-size:30pt;
32 | padding-bottom:25px;
33 | }
34 |
35 | .footer {
36 | font-size: 14px;
37 | text-align: center;
38 | padding-bottom:5%;
39 | }
40 |
41 | .wrapper {
42 | display: flex;
43 | width: 100%;
44 | align-items: stretch;
45 | }
46 |
47 | .stat-box {
48 | border-style: solid;
49 | border-width: 2px;
50 | border-top-width: 0;
51 | border-bottom-width: 0;
52 | border-color: rgba(197, 197, 197);
53 | border-radius: 25px;
54 | padding: 25px 30px;
55 | }
56 |
57 | .stat-table-header {
58 | border-bottom-style: solid;
59 | border-bottom-color: lightgrey;
60 | border-bottom-width: 1px;
61 | }
62 |
63 | .tr {
64 | line-height: 25px;
65 | }
66 |
67 | .tabcontent {
68 | display: none;
69 | }
70 |
71 | #sidebar {
72 | min-width: 250px;
73 | max-width: 250px;
74 | min-height: 100vh;
75 | padding: 20px;
76 | padding-top: 40px;
77 | background-color: #fbfbfb;
78 | }
79 |
80 | .sidebar-sub-entry {
81 | text-indent: 2em;
82 | font-style: italic;
83 | font-size: 14px;
84 | }
85 |
86 | .card {
87 | margin: 1%;
88 | width: 20em;
89 | }
90 |
91 | .tablinks {
92 | border: none;
93 | min-height: 50px;
94 | margin: 0;
95 | float: left;
96 | background-color: #f2f2f4;
97 | transition: 0.2s;
98 | border-top-style: solid;
99 | border-top-color: #cdcdcd;
100 | border-top-width: 2px;
101 | }
102 |
103 | .tab {
104 | border-bottom-color: #ebebeb;
105 | border-bottom-style: solid;
106 | border-bottom-width: 2px;
107 | padding-left: 0;
108 | padding-right: 30px;
109 | margin-bottom: 10px;
110 | }
111 |
112 | .tab button:hover {
113 | background-color: #e5e5e5;
114 | }
115 |
116 | .tab button:focus {
117 | outline: 0;
118 | }
119 |
120 | .active {
121 | background-color: #d6d6d6;
122 | border-top-style: solid;
123 | border-top-color: #9a9a9a;
124 | border-top-width: 4px;
125 | }
126 |
--------------------------------------------------------------------------------
/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WikipediaLibrary/externallinks/f86118b12123616b6c3f1462f7397e1caf6b68c3/static/favicon.ico
--------------------------------------------------------------------------------
/template.env:
--------------------------------------------------------------------------------
1 | ENV=local
2 | DJANGO_SETTINGS_MODULE=extlinks.settings.local
3 | # To enable admin error emails, change to django.core.mail.backends.smtp.EmailBackend
4 | DJANGO_EMAIL_ADMINS_BACKEND=django.core.mail.backends.console.EmailBackend
5 | DJANGO_EMAIL_HOST=localhost
6 | SECRET_KEY=et=o&42)q3r8ztu&b^sjs2+%vv3^mg%3amzcia8^)fq*w6#oj3
7 | MYSQL_DATABASE=extlinks_db
8 | MYSQL_ROOT_PASSWORD=links
9 | REPLICA_DB_USER=user
10 | REPLICA_DB_PASSWORD=password
11 | TWL_API_TOKEN=token
12 | # When building, these define how the local images are tagged
13 | # When pulling, these define which images are pulled
14 | EVENTSTREAM_TAG=latest
15 | EXTERNALLINKS_TAG=latest
16 | # Change to something like /data/project/prod for real servers.
17 | HOST_BACKUP_DIR=./backup
18 | # Swift Object Store (for storing archives), `local` will just ignore Swift usage
19 | SWIFT_APPLICATION_CREDENTIAL_ID=local
20 | SWIFT_APPLICATION_CREDENTIAL_SECRET=local
21 | SWIFT_CONTAINER_NAME=archive-linkevents
22 | # See the readme file for setting up a local swift store
23 | # In production, it should use the known URL https://openstack.eqiad1.wikimediacloud.org:25000/v3
24 | OPENSTACK_AUTH_URL=http://externallinks-swift:5001/v3
25 | LINKEVENTS_ARCHIVE_OBJECT_STORAGE_ONLY=false
26 |
--------------------------------------------------------------------------------
/wiki-list.csv:
--------------------------------------------------------------------------------
1 | en
2 | ceb
3 | sv
4 | de
5 | fr
6 | nl
7 | ru
8 | it
9 | es
10 | pl
11 | war
12 | vi
13 | ja
14 | zh
15 | pt
16 | uk
17 | ar
18 | fa
19 | sr
20 | ca
21 | no
22 | id
23 | fi
24 | ko
25 | hu
26 | sh
27 | cs
28 | ro
29 | eu
30 | tr
31 | ms
32 | eo
33 | bg
34 | hy
35 | da
36 | he
37 | sk
38 | zh_min_nan
39 | kk
40 | min
41 | ce
42 | hr
43 | lt
44 | et
45 | be
46 | sl
47 | el
48 | gl
49 | nn
50 | az
51 | ur
52 | simple
53 | azb
54 | th
55 | hi
56 | uz
57 | la
58 | ka
59 | vo
60 | ta
61 | cy
62 | mk
63 | ast
64 | tg
65 | lv
66 | mg
67 | tt
68 | oc
69 | af
70 | bs
71 | ky
72 | sq
73 | tl
74 | zh_yue
75 | new
76 | te
77 | bn
78 | br
79 | pms
80 | ml
81 | lb
82 | jv
83 | ht
84 | sco
85 | mr
86 | sw
87 | ga
88 | nds
89 | su
90 | ba
91 | pnb
92 | is
93 | my
94 | fy
95 | cv
96 | lmo
97 | an
98 | ne
99 | yo
100 | pa
101 | gu
102 | io
103 | bar
104 | scn
105 | ku
106 | als
107 | bpy
108 | kn
109 | ckb
110 | ia
111 | qu
112 | arz
113 | wuu
114 | mn
115 | bat_smg
116 | si
117 | or
118 | wa
119 | gd
120 | am
121 | yi
122 | cdo
123 | nap
124 | bug
125 | hsb
126 | mai
127 | map_bms
128 | mzn
129 | fo
130 | xmf
131 | li
132 | ilo
133 | eml
134 | sah
135 | vec
136 | os
137 | sd
138 | sa
139 | diq
140 | mrj
141 | ps
142 | mhr
143 | hif
144 | zh_classical
145 | roa_tara
146 | bcl
147 | ace
148 | hak
149 | frr
150 | pam
151 | szl
152 | nso
153 | nv
154 | se
155 | km
156 | mi
157 | rue
158 | nah
159 | bh
160 | nds_nl
161 | vls
162 | crh
163 | gan
164 | sc
165 | vep
166 | bo
167 | glk
168 | myv
169 | co
170 | as
171 | tk
172 | fiu_vro
173 | so
174 | kv
175 | lrc
176 | csb
177 | gv
178 | udm
179 | zea
180 | ay
181 | ie
182 | pcd
183 | sn
184 | nrm
185 | ug
186 | stq
187 | lez
188 | kw
189 | lad
190 | mwl
191 | gom
192 | ab
193 | gn
194 | haw
195 | rm
196 | ha
197 | lij
198 | kab
199 | koi
200 | lfn
201 | lo
202 | mt
203 | fur
204 | frp
205 | dsb
206 | ln
207 | ang
208 | ext
209 | olo
210 | dty
211 | cbk_zam
212 | dv
213 | ksh
214 | gag
215 | pi
216 | pag
217 | pfl
218 | bjn
219 | av
220 | bxr
221 | xal
222 | gor
223 | krc
224 | za
225 | pap
226 | kaa
227 | pdc
228 | rw
229 | tyv
230 | to
231 | kl
232 | nov
233 | jam
234 | arc
235 | kbp
236 | kbd
237 | tpi
238 | tet
239 | ig
240 | ki
241 | na
242 | jbo
243 | wo
244 | roa_rup
245 | lbe
246 | bi
247 | ty
248 | kg
249 | mdf
250 | lg
251 | zu
252 | srn
253 | tcy
254 | inh
255 | atj
256 | chr
257 | ltg
258 | sat
259 | sm
260 | xh
261 | om
262 | pih
263 | cu
264 | rmy
265 | tw
266 | bm
267 | tn
268 | chy
269 | rn
270 | got
271 | ts
272 | tum
273 | ak
274 | st
275 | ny
276 | ch
277 | ss
278 | pnt
279 | fj
280 | iu
281 | ady
282 | ee
283 | ks
284 | ve
285 | ik
286 | sg
287 | ff
288 | dz
289 | ti
290 | cr
291 | din
292 | ng
293 | cho
294 | kj
295 | mh
296 | ho
297 | ii
298 | aa
299 | mus
300 | hz
301 | kr
302 | shn
303 | hyw
304 |
--------------------------------------------------------------------------------