├── .github └── workflows │ ├── release.yml │ └── test.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── nginx.conf └── tests ├── mitmtest.sh └── pypi_intercept.py /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | on: 2 | release: 3 | types: [published] 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: read 10 | packages: write 11 | 12 | jobs: 13 | release: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: docker/login-action@v3 17 | with: 18 | registry: ghcr.io 19 | username: ${{ github.actor }} 20 | password: ${{ github.token }} 21 | 22 | - uses: docker/build-push-action@v6 23 | with: 24 | push: true 25 | tags: | 26 | ghcr.io/${{ github.repository }}:${{ github.ref_name }} 27 | ghcr.io/${{ github.repository }}:latest 28 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | workflow_dispatch: 7 | schedule: 8 | - cron: '43 5 1 * *' 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | mitmtest: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: actions/setup-python@v5 19 | - run: bash tests/mitmtest.sh 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | 4 | .DS_Store 5 | 6 | .env 7 | .venv 8 | env/ 9 | venv/ 10 | 11 | build/ 12 | dist/ 13 | *.egg-info/ 14 | 15 | .tox/ 16 | .mypy_cache/ 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:1.23.2 2 | 3 | RUN mkdir -p /var/lib/nginx/pypi/ /var/log/nginx/ /var/run/ 4 | ADD 
nginx.conf /etc/nginx/nginx.conf 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 hauntsaninja 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nginx_pypi_cache 2 | 3 | A PyPI cache using nginx. 4 | 5 | ## Usage 6 | 7 | This serves as a caching mirror for PyPI. It's a simple stateless service and does not 8 | support uploading packages / private indices. For this use case, I've found it to be 9 | significantly faster and significantly more reliable than devpi. 10 | 11 | To run it locally: 12 | ```bash 13 | docker run -p 80:80 --rm $(docker build -q .) 
14 | ``` 15 | 16 | To tell `pip` to connect to this instead of `pypi.org`, use: 17 | ```bash 18 | pip install --index-url=http://localhost/simple mypy 19 | ``` 20 | or 21 | ```bash 22 | export PIP_INDEX_URL=http://localhost/simple 23 | pip install mypy 24 | ``` 25 | 26 | ## Github container registry 27 | 28 | To pull the latest version from the Github container registry: 29 | 30 | ```bash 31 | docker pull ghcr.io/hauntsaninja/nginx_pypi_cache:latest 32 | ``` 33 | 34 | See https://github.com/hauntsaninja/nginx_pypi_cache/pkgs/container/nginx_pypi_cache 35 | 36 | ## Troubleshooting 37 | 38 | It turns out it's surprisingly easy to mess something up and not actually end up proxying 39 | requests. `tests/mitmtest.sh` should help confirm that we're hitting the cache when we expect to, 40 | instead of hitting upstream PyPI. 41 | 42 | The log messages are also pretty useful (check `nginx.conf` to see exactly what these 43 | correspond to): 44 | ``` 45 | 172.17.0.1 - localhost [13/Jan/2023:02:36:00 +0000] request_time=0.000 upstream_time=- cache_status=HIT 200 "GET /simple/mypy/ HTTP/1.1" 78368 46 | ``` 47 | -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | # Loosely based on the following: 2 | # (note these do not work correctly in 2023) 3 | # https://joelkleier.com/blog/2018-04-17-pypi-temporary-cache.html 4 | # https://gist.github.com/dctrwatson/5785638#file-nginx-conf 5 | # It's also very easy to end up not proxying requests; tests/mitmtest.sh should help verify that 6 | # pip installs actually avoid hitting upstream 7 | 8 | error_log /dev/stderr; 9 | # Log to file, can be useful for dev 10 | # error_log /var/log/nginx/error.log; 11 | pid /var/run/nginx.pid; 12 | 13 | worker_processes auto; 14 | 15 | events { 16 | worker_connections 2048; 17 | } 18 | 19 | http { 20 | include /etc/nginx/mime.types; 21 | default_type application/octet-stream; 22 
| sendfile on; 23 | tcp_nodelay on; 24 | tcp_nopush off; 25 | reset_timedout_connection on; 26 | server_tokens off; 27 | gzip on; 28 | gzip_types application/vnd.pypi.simple.v1+json; 29 | gzip_proxied any; 30 | gzip_vary on; 31 | 32 | log_format pypi_cache '$remote_addr - $http_host [$time_local] ' 33 | 'request_time=$request_time upstream_time=$upstream_response_time ' 34 | 'cache_status=$upstream_cache_status \t' 35 | '$status "$request" $body_bytes_sent'; 36 | access_log /dev/stdout pypi_cache buffer=64k flush=1s; 37 | # Log to file, can be useful for dev 38 | # access_log /var/log/nginx/cache.log pypi_cache buffer=64k flush=1s; 39 | 40 | # Cache 50G worth of packages for up to 1 month 41 | proxy_cache_path /var/lib/nginx/pypi levels=1:2 keys_zone=pypi:16m inactive=1M max_size=50G use_temp_path=off; 42 | 43 | # Having the same upstream server listed twice allegedly forces nginx to retry 44 | # connections and not fail the request immediately. 45 | upstream sg_pypi { 46 | server pypi.org:443; 47 | server pypi.org:443; 48 | keepalive 16; 49 | } 50 | upstream sg_pythonhosted { 51 | server files.pythonhosted.org:443; 52 | server files.pythonhosted.org:443; 53 | keepalive 16; 54 | } 55 | 56 | server { 57 | listen 80 default_server; 58 | 59 | proxy_cache pypi; 60 | proxy_cache_key $uri/$http_accept_encoding; 61 | proxy_cache_lock on; 62 | proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504; 63 | 64 | proxy_http_version 1.1; 65 | proxy_ssl_server_name on; 66 | 67 | # sub_filter can't apply to gzipped content, so be careful about that 68 | add_header X-Pypi-Cache $upstream_cache_status; 69 | sub_filter 'https://pypi.org' $scheme://$http_host; 70 | sub_filter 'https://files.pythonhosted.org/packages' $scheme://$http_host/packages; 71 | sub_filter_once off; 72 | sub_filter_types application/vnd.pypi.simple.v1+json application/vnd.pypi.simple.v1+html; 73 | 74 | location / { 75 | proxy_set_header Connection ""; 76 | proxy_set_header 
Accept-Encoding ""; 77 | proxy_cache_valid 200 301 10m; 78 | proxy_cache_valid 404 1m; 79 | 80 | proxy_set_header Host pypi.org; 81 | proxy_ssl_name pypi.org; 82 | proxy_pass 'https://sg_pypi'; 83 | proxy_redirect 'https://pypi.org' $scheme://$http_host; 84 | } 85 | 86 | location ^~ /simple { 87 | proxy_set_header Connection ""; 88 | proxy_set_header Accept-Encoding ""; 89 | proxy_cache_valid 200 301 10m; 90 | proxy_cache_valid 404 1m; 91 | 92 | proxy_set_header Host pypi.org; 93 | proxy_ssl_name pypi.org; 94 | proxy_pass 'https://sg_pypi'; 95 | proxy_redirect 'https://pypi.org' $scheme://$http_host; 96 | } 97 | 98 | location ^~ /packages { 99 | proxy_set_header Connection ""; 100 | proxy_set_header Accept-Encoding ""; 101 | proxy_cache_valid 200 301 1M; 102 | proxy_cache_valid 404 1m; 103 | 104 | proxy_set_header Host files.pythonhosted.org; 105 | proxy_ssl_name files.pythonhosted.org; 106 | proxy_pass 'https://sg_pythonhosted/packages'; 107 | proxy_redirect 'https://files.pythonhosted.org/packages' $scheme://$http_host/packages; 108 | } 109 | 110 | location /nginx_status { 111 | stub_status; 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /tests/mitmtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "${BASH_SOURCE[0]}")" || exit 127 3 | 4 | print_bold_red() { 5 | printf '\033[1;31m%s\033[0m\n' "$1" 6 | } 7 | 8 | set -x 9 | 10 | if ! docker ps >/dev/null; then 11 | print_bold_red "test requires docker to be running" 12 | exit 1 13 | fi 14 | 15 | venv_dir=$(mktemp -d) 16 | if ! 
python3 -c 'import sys; assert sys.version_info >= (3, 9)'; then 17 | print_bold_red "test requires Python 3.9 or newer" 18 | exit 1 19 | fi 20 | python3 -m venv "$venv_dir" 21 | 22 | export PIP_DISABLE_PIP_VERSION_CHECK=1 23 | "$venv_dir/bin/pip" install uv >/dev/null 24 | "$venv_dir/bin/python" -m uv pip install --python "$venv_dir/bin/python" --upgrade wheel pip mitmproxy >/dev/null 25 | 26 | docker_image=$(docker build -q ../) 27 | 28 | # kill background jobs on exit 29 | trap 'echo "cleaning up..."; docker kill nginx-pypi-cache; jobs -p | xargs -r kill; sleep 1' SIGTERM EXIT 30 | 31 | # run mitmdump on unprivileged port 32 | MITM=12345 33 | "$venv_dir/bin/mitmdump" -s pypi_intercept.py -p $MITM & 34 | 35 | # run the pypi cache on port 80 36 | docker run --name nginx-pypi-cache -p 80:80 --rm "$docker_image" & 37 | 38 | # wait for everything to come up 39 | sleep 5 40 | 41 | printf '\n\n\n===== basic curl test =====\n\n\n\n' 42 | 43 | # check a curl to pypi cache works 44 | STATUS=$(curl -s --output /dev/null --write-out "%{http_code}" http://localhost/simple/) 45 | if [ "$STATUS" -ne 200 ]; then 46 | print_bold_red "failed to issue request to pypi cache, got $STATUS" 47 | exit 1 48 | fi 49 | STATUS=$(curl -s --output /dev/null --write-out "%{http_code}" http://localhost/simple/boostedblob/) 50 | if [ "$STATUS" -ne 200 ]; then 51 | print_bold_red "failed to issue request to pypi cache, got $STATUS" 52 | exit 1 53 | fi 54 | # check that the mypy response was not cached 55 | if ! curl -s -I -X GET http://localhost/simple/mypy/ | grep -q 'X-Pypi-Cache: MISS'; then 56 | print_bold_red "mypy response was missing cache header (or unexpectedly cached)" 57 | exit 1 58 | fi 59 | # check that the mypy response did get cached 60 | if ! 
curl -s -I -X GET http://localhost/simple/mypy/ | grep -q 'X-Pypi-Cache: HIT'; then 61 | print_bold_red "mypy response was not cached" 62 | exit 1 63 | fi 64 | # check a curl to pypi cache that should fail does fail 65 | STATUS=$(curl -s --output /dev/null --write-out "%{http_code}" http://localhost/doesnotexist/) 66 | if [ "$STATUS" -ne 404 ]; then 67 | print_bold_red "expected 404 from pypi cache, got $STATUS" 68 | exit 1 69 | fi 70 | 71 | printf '\n\n\n===== mitm pip test =====\n\n\n\n' 72 | 73 | # check that mitmdump prevents pip installs from upstream pypi 74 | REQUESTS_CA_BUNDLE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/pip" install --no-cache-dir --force-reinstall mypy 75 | if [ $? -ne 1 ]; then 76 | print_bold_red "installing mypy from upstream unexpectedly succeeded (should be blocked by mitmdump)" 77 | exit 1 78 | fi 79 | # check that mitmdump prevents pip installs of numpy from pypi cache 80 | REQUESTS_CA_BUNDLE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/pip" install --no-cache-dir --force-reinstall --index-url=http://localhost/simple numpy 81 | if [ $? -ne 1 ]; then 82 | print_bold_red "installing numpy from pypi cache unexpectedly succeeded (should be blocked by mitmdump)" 83 | exit 1 84 | fi 85 | # but everything works for other packages if we use the pypi cache 86 | REQUESTS_CA_BUNDLE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/pip" install --no-cache-dir --force-reinstall --index-url=http://localhost/simple mypy 87 | # shellcheck disable=SC2181 88 | if [ $? 
-ne 0 ]; then 89 | print_bold_red "failed to install mypy from pypi cache" 90 | exit 1 91 | fi 92 | 93 | # TODO: check that the pypi cache is actually getting cache hits, should be visible in the access log when we run the following 94 | REQUESTS_CA_BUNDLE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/pip" install --no-cache-dir --force-reinstall --index-url=http://localhost/simple mypy 95 | 96 | # check that installing mypy did not invalidate the cache (the requests use different Accept headers) 97 | if ! curl -s -I -X GET http://localhost/simple/mypy/ | grep -q 'X-Pypi-Cache: HIT'; then 98 | print_bold_red "mypy response was not cached" 99 | exit 1 100 | fi 101 | 102 | printf '\n\n\n===== mitm uv test =====\n\n\n\n' 103 | 104 | # check that mitmdump prevents pip installs from upstream pypi 105 | SSL_CERT_FILE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/python" -m uv pip install --python "$venv_dir/bin/python" --no-cache-dir --force-reinstall mypy 106 | if [ $? -ne 2 ]; then 107 | print_bold_red "installing mypy from upstream unexpectedly succeeded (should be blocked by mitmdump)" 108 | exit 1 109 | fi 110 | # check that mitmdump prevents pip installs of numpy from pypi cache 111 | SSL_CERT_FILE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/python" -m uv pip install --python "$venv_dir/bin/python" --no-cache-dir --force-reinstall --index-url=http://localhost/simple numpy 112 | if [ $? 
-ne 2 ]; then 113 | print_bold_red "installing numpy from pypi cache unexpectedly succeeded (should be blocked by mitmdump)" 114 | exit 1 115 | fi 116 | # but everything works for other packages if we use the pypi cache 117 | SSL_CERT_FILE=~/.mitmproxy/mitmproxy-ca.pem ALL_PROXY=http://localhost:$MITM/ "$venv_dir/bin/python" -m uv pip install --python "$venv_dir/bin/python" --no-cache-dir --force-reinstall --index-url=http://localhost/simple mypy 118 | # shellcheck disable=SC2181 119 | if [ $? -ne 0 ]; then 120 | print_bold_red "failed to install mypy from pypi cache" 121 | exit 1 122 | fi 123 | 124 | printf '\033[1;32m%s\033[0m\n' "success!" 125 | -------------------------------------------------------------------------------- /tests/pypi_intercept.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from mitmproxy import http 3 | 4 | 5 | class ChangeHTTPCode: 6 | def response(self, flow: http.HTTPFlow) -> None: 7 | if not flow.response: 8 | raise ValueError("No response") 9 | if flow.response.status_code >= 400: 10 | print( 11 | f"[pypi_intercept] got error response: {flow.response.status_code}", 12 | flow.response.data, 13 | file=sys.stderr, 14 | ) 15 | 16 | # fail anything fetched from upstream pypi / pythonhosted by forcing the response status to 400 17 | if "pypi" in flow.request.pretty_url or "pythonhosted" in flow.request.pretty_url: 18 | flow.response.status_code = 400 19 | # likewise force a 400 on any response whose URL mentions numpy, whichever host served it 20 | if "numpy" in flow.request.pretty_url: 21 | flow.response.status_code = 400 22 | 23 | 24 | addons = [ChangeHTTPCode()] 25 | --------------------------------------------------------------------------------