├── .editorconfig ├── .env.template ├── .github ├── actions │ └── setup-poetry-env │ │ └── action.yml └── workflows │ ├── main.yml │ └── publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── DockerfileCPU ├── LICENSE ├── Makefile ├── README.md ├── SETUP.md ├── data └── .gitignore ├── docker-compose.yml ├── frontend ├── .dockerignore ├── .eslintrc.json ├── .gitignore ├── Dockerfile ├── README.md ├── app │ ├── components │ │ ├── GitHubButton.tsx │ │ ├── GoogleAnalytics.tsx │ │ ├── Header.tsx │ │ ├── InfoBox.tsx │ │ ├── ScatterPlot.tsx │ │ ├── SearchResultsTable.tsx │ │ ├── SupportButton.tsx │ │ └── ToggleSwitch.tsx │ ├── favicon.ico │ ├── globals.css │ ├── layout.tsx │ ├── page.tsx │ └── utils │ │ └── search.ts ├── next.config.mjs ├── package-lock.json ├── package.json ├── postcss.config.mjs ├── public │ ├── kofi.png │ ├── next.svg │ ├── pypi-light.svg │ ├── pypi.svg │ └── vercel.svg ├── tailwind.config.ts └── tsconfig.json ├── package-lock.json ├── package.json ├── poetry.lock ├── pypi_bigquery.sql ├── pypi_scout ├── __init__.py ├── api │ ├── data_loader.py │ ├── main.py │ └── models.py ├── config.py ├── data │ ├── description_cleaner.py │ └── raw_data_reader.py ├── embeddings │ ├── embeddings_creator.py │ └── simple_vector_database.py ├── scripts │ ├── create_vector_embeddings.py │ ├── download_raw_dataset.py │ ├── process_raw_dataset.py │ ├── setup.py │ └── upload_processed_datasets.py └── utils │ ├── blob_io.py │ ├── logging.py │ └── score_calculator.py ├── pyproject.toml ├── requirements-cpu.txt ├── static ├── demo.gif ├── pypi-light.svg └── pypi.svg └── tests └── embeddings └── test_simple_vector_database.py /.editorconfig: -------------------------------------------------------------------------------- 1 | max_line_length = 120 2 | 3 | [*.json] 4 | indent_style = space 5 | indent_size = 4 6 | -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | STORAGE_BACKEND=BLOB 2 | STORAGE_BACKEND_BLOB_ACCOUNT_NAME= 3 | STORAGE_BACKEND_BLOB_CONTAINER_NAME= 4 | STORAGE_BACKEND_BLOB_KEY= 5 | -------------------------------------------------------------------------------- /.github/actions/setup-poetry-env/action.yml: -------------------------------------------------------------------------------- 1 | name: "setup-poetry-env" 2 | description: "Composite action to setup the Python and poetry environment." 
3 | 4 | inputs: 5 | python-version: 6 | required: false 7 | description: "The python version to use" 8 | default: "3.11" 9 | 10 | runs: 11 | using: "composite" 12 | steps: 13 | - name: Set up python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ inputs.python-version }} 17 | 18 | - name: Install Poetry 19 | uses: snok/install-poetry@v1 20 | with: 21 | virtualenvs-in-project: true 22 | 23 | - name: Load cached venv 24 | id: cached-poetry-dependencies 25 | uses: actions/cache@v4 26 | with: 27 | path: .venv 28 | key: venv-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('poetry.lock') }} 29 | 30 | - name: Install dependencies 31 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 32 | run: poetry install --no-interaction 33 | shell: bash 34 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Main 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened, ready_for_review] 9 | 10 | jobs: 11 | quality: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out 15 | uses: actions/checkout@v4 16 | 17 | - uses: actions/cache@v4 18 | with: 19 | path: ~/.cache/pre-commit 20 | key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} 21 | 22 | - name: Set up the environment 23 | uses: ./.github/actions/setup-poetry-env 24 | 25 | - name: Run checks 26 | run: make check 27 | 28 | tox: 29 | runs-on: ubuntu-latest 30 | strategy: 31 | matrix: 32 | python-version: ["3.9", "3.10", "3.11", "3.12"] 33 | fail-fast: false 34 | steps: 35 | - name: Check out 36 | uses: actions/checkout@v4 37 | 38 | - name: Set up python 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | 43 | - name: Install Poetry 44 | uses: snok/install-poetry@v1 45 | 46 | - name: Load cached venv 47 | uses: actions/cache@v4 48 | with: 49 | path: .tox 50 | key: venv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('poetry.lock') }} 51 | 52 | - name: Install tox 53 | run: | 54 | python -m pip install --upgrade pip 55 | python -m pip install tox tox-gh-actions 56 | 57 | - name: Test with tox 58 | run: tox 59 | 60 | - name: Upload coverage reports to Codecov with GitHub Action on Python 3.11 61 | uses: codecov/codecov-action@v4 62 | if: ${{ matrix.python-version == '3.11' }} 63 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Docker Images 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build-and-push-backend: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Docker Buildx 14 | uses: docker/setup-buildx-action@v2 15 | 16 | - name: Login to Azure Container Registry 17 | uses: azure/docker-login@v1 18 | with: 19 | login-server: pypiscoutacr.azurecr.io 20 | username: ${{ secrets.ACR_USERNAME }} 21 | password: ${{ secrets.ACR_PASSWORD }} 22 | 23 | - name: Build and Push Backend Docker image 24 | uses: docker/build-push-action@v4 25 | with: 26 | context: . 
27 | file: ./DockerfileCPU 28 | platforms: linux/amd64 29 | push: true 30 | tags: pypiscoutacr.azurecr.io/pypi-scout-backend:latest 31 | 32 | build-and-push-frontend: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: Checkout repository 36 | uses: actions/checkout@v2 37 | 38 | - name: Set up Docker Buildx 39 | uses: docker/setup-buildx-action@v2 40 | 41 | - name: Login to Azure Container Registry 42 | uses: azure/docker-login@v1 43 | with: 44 | login-server: pypiscoutacr.azurecr.io 45 | username: ${{ secrets.ACR_USERNAME }} 46 | password: ${{ secrets.ACR_PASSWORD }} 47 | 48 | - name: Build and Push Frontend Docker image 49 | uses: docker/build-push-action@v4 50 | with: 51 | context: ./frontend 52 | file: ./frontend/Dockerfile 53 | platforms: linux/amd64 54 | push: true 55 | tags: pypiscoutacr.azurecr.io/pypi-scout-frontend:latest 56 | build-args: | 57 | NEXT_PUBLIC_API_URL=https://pypiscout.com/api 58 | NEXT_PUBLIC_GA_TRACKING_ID=${{ secrets.NEXT_PUBLIC_GA_TRACKING_ID }} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/source 2 | 3 | # From https://raw.githubusercontent.com/github/gitignore/main/Python.gitignore 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # Vscode config files 160 | .vscode/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | #.idea/ 168 | 169 | .env 170 | .DS_Store 171 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: "v4.4.0" 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-merge-conflict 7 | - id: check-toml 8 | - id: check-yaml 9 | - id: end-of-file-fixer 10 | - id: trailing-whitespace 11 | 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | rev: "v0.1.6" 14 | hooks: 15 | - id: ruff 16 | args: [--exit-non-zero-on-fix] 17 | - id: ruff-format 18 | 19 | - repo: https://github.com/pre-commit/mirrors-prettier 20 | rev: "v3.0.3" 21 | hooks: 22 | - id: prettier 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM python:3.10-slim-bookworm 4 | 5 | ENV POETRY_VERSION=1.6 \ 6 | POETRY_VIRTUALENVS_CREATE=false 7 | 8 | # Install poetry and clean up 9 | RUN pip install "poetry==$POETRY_VERSION" && \ 10 | rm -rf /root/.cache/pip 11 | 12 | # Set work directory 13 | WORKDIR /code 14 | 15 | # Copy only requirements to cache them in docker layer 16 | COPY poetry.lock pyproject.toml /code/ 17 | 18 | # Install project dependencies and clean up 19 | RUN poetry install --no-interaction --no-ansi --no-root --no-dev && \ 20 | rm -rf /root/.cache/pip 21 | 22 | # Copy Python code to the Docker image 23 | COPY pypi_scout /code/pypi_scout/ 24 | 25 | # Make empty data directory 26 | RUN mkdir -p /code/data 27 | 28 | ENV PYTHONPATH=/code 29 | 30 | # Use the script as the entrypoint 31 | CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"] 32 | 
-------------------------------------------------------------------------------- /DockerfileCPU: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | # Use a slim Python image as the base 4 | FROM python:3.10-slim-bookworm 5 | 6 | # Set environment variables 7 | ENV PYTHONUNBUFFERED=1 8 | 9 | # Install system dependencies 10 | RUN apt-get update && apt-get install -y --no-install-recommends \ 11 | build-essential \ 12 | && apt-get clean && rm -rf /var/lib/apt/lists/* 13 | 14 | # Set working directory 15 | WORKDIR /code 16 | 17 | # Copy only requirements to cache them in docker layer 18 | COPY requirements-cpu.txt /code/requirements-cpu.txt 19 | 20 | # Install Python dependencies 21 | RUN pip install --no-cache-dir -r requirements-cpu.txt 22 | 23 | # Copy the rest of the application code 24 | COPY pypi_scout /code/pypi_scout/ 25 | 26 | # Make empty data directory 27 | RUN mkdir -p /code/data 28 | 29 | ENV PYTHONPATH=/code 30 | 31 | # Use the script as the entrypoint 32 | CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: install 2 | install: ## Install the poetry environment and install the pre-commit hooks 3 | @echo "🚀 Creating virtual environment using pyenv and poetry" 4 | @poetry install 5 | @ poetry run pre-commit install 6 | @poetry shell 7 | 8 | .PHONY: check 9 | check: ## Run code quality tools. 
10 | @echo "🚀 Checking Poetry lock file consistency with 'pyproject.toml': Running poetry check --lock" 11 | @poetry check --lock 12 | @echo "🚀 Linting code: Running pre-commit" 13 | @poetry run pre-commit run -a 14 | @echo "🚀 Checking for obsolete dependencies: Running deptry" 15 | @poetry run deptry . 16 | 17 | .PHONY: test 18 | test: ## Test the code with pytest 19 | @echo "🚀 Testing code: Running pytest" 20 | @poetry run pytest --cov --cov-config=pyproject.toml --cov-report=xml 21 | 22 | .PHONY: build 23 | build: ## Build wheel file using poetry 24 | @echo "🚀 Creating wheel file" 25 | @poetry build 26 | 27 | .PHONY: serve 28 | serve: ## Serve API with uvicorn in development mode 29 | @poetry run uvicorn pypi_scout.api.main:app --reload 30 | 31 | .PHONY: frontend 32 | frontend: ## Serve frontend in development mode 33 | @cd frontend; npm run dev 34 | 35 | .PHONY: help 36 | help: 37 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 38 | 39 | .DEFAULT_GOAL := help 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | <!-- PyPI Scout Logo image (static/pypi.svg, static/pypi-light.svg) --> 3 | 4 | 5 | 6 | 7 | <!-- PyPI Scout Demo animation (static/demo.gif) --> 8 |
9 | 10 | ## What does this do? 11 | 12 | Finding the right Python package on [PyPI](https://pypi.org/) can be a bit difficult, since PyPI isn't really designed for discovering packages easily. For example, you can search for the word "plot" and get a list of hundreds of packages that contain the word "plot" in seemingly random order. 13 | 14 | Inspired by [this blog post](https://koaning.io/posts/search-boxes/) about finding arXiv articles using vector embeddings, I decided to build a small application that helps you find Python packages with a similar approach. For example, you can ask it "I want to make nice plots and visualizations", and it will provide you with a short list of packages that can help you with that. 15 | 16 | ## How does this work? 17 | 18 | The project works by collecting project summaries and descriptions for all packages on PyPI with more than 100 weekly downloads. These are then converted into vector representations using [Sentence Transformers](https://www.sbert.net/). When the user enters a query, it is converted into a vector representation, and the most similar package descriptions are fetched from the vector database. Additional weight is given to the number of weekly downloads before the results are presented to the user in a dashboard. 19 | 20 | ## Stack 21 | 22 | The project uses the following technologies: 23 | 24 | 1. **[FastAPI](https://fastapi.tiangolo.com/)** for the API backend 25 | 2. **[NextJS](https://nextjs.org/) and [TailwindCSS](https://tailwindcss.com/)** for the frontend 26 | 3. **[Sentence Transformers](https://www.sbert.net/)** for vector embeddings 27 | 28 | ## Getting Started 29 | 30 | ### Build and Setup 31 | 32 | #### 1. (Optional) **Create a `.env` file** 33 | 34 | By default, all data will be stored on your local machine. It is also possible to store the data for the API on Azure Blob storage, and 35 | have the API read from there. To do so, create a `.env` file: 36 | 37 | ```sh 38 | cp .env.template .env 39 | ``` 40 | 41 | and fill in the required fields. 42 | 43 | #### 2. **Run the Setup Script** 44 | 45 | The setup script will: 46 | 47 | - Download and process the PyPI dataset and store the results in the `data` directory. 48 | - Create vector embeddings for the PyPI dataset. 49 | - If the `STORAGE_BACKEND` environment variable is set to `BLOB`: Upload the datasets to blob storage. 50 | 51 | There are three ways to run the setup script, depending on whether you have an NVIDIA GPU and the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed. Please run the setup script using the method that applies to you: 52 | 53 | - [Option 1: Using Poetry](SETUP.md#option-1-using-poetry) 54 | - [Option 2: Using Docker with NVIDIA GPU and NVIDIA Container Toolkit](SETUP.md#option-2-using-docker-with-nvidia-gpu-and-nvidia-container-toolkit) 55 | - [Option 3: Using Docker without NVIDIA GPU and NVIDIA Container Toolkit](SETUP.md#option-3-using-docker-without-nvidia-gpu-and-nvidia-container-toolkit) 56 | 57 | > [!NOTE] 58 | > The dataset contains approximately 100,000 packages on PyPI with more than 100 weekly downloads. To speed up local development, 59 | > you can reduce the number of packages that are processed locally by lowering the value of `FRAC_DATA_TO_INCLUDE` in `pypi_scout/config.py`. 60 | 61 | #### 3.
**Run the Application** 62 | 63 | Start the application using Docker Compose: 64 | 65 | ```sh 66 | docker-compose up 67 | ``` 68 | 69 | After a short while, your application will be live at [http://localhost:3000](http://localhost:3000). 70 | 71 | ## Data 72 | 73 | The dataset for this project is created using the [PyPI dataset on Google BigQuery](https://console.cloud.google.com/marketplace/product/gcp-public-data-pypi/pypi?project=regal-net-412415). The SQL query used can be found in [pypi_bigquery.sql](./pypi_bigquery.sql). The resulting dataset is available as a CSV file on [Google Drive](https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing). 74 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | # Running the Setup Script 2 | 3 | The setup script will: 4 | 5 | - Download and process the PyPI dataset and store the results in the `data` directory. 6 | - Create vector embeddings for the PyPI dataset. 7 | - If the `STORAGE_BACKEND` environment variable is set to `BLOB`: Upload the datasets to blob storage. 8 | 9 | There are three ways to run the setup script: 10 | 11 | ### Option 1: Using Poetry 12 | 13 | You can run the setup script using a virtual environment with Poetry. This method will automatically utilize your GPU for the vector embeddings if it is detected. 14 | 15 | 1. Install dependencies and set up the virtual environment: 16 | 17 | ```sh 18 | poetry install 19 | ``` 20 | 21 | 2. Run the setup script: 22 | 23 | ```sh 24 | poetry run python pypi_scout/scripts/setup.py 25 | ``` 26 | 27 | ### Option 2: Using Docker with NVIDIA GPU and NVIDIA Container Toolkit 28 | 29 | If you have an NVIDIA GPU and the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed, follow these steps: 30 | 31 | 1. Build the Docker image: 32 | 33 | ```sh 34 | docker build -t pypi-scout . 35 | ``` 36 | 37 | 2. Run the setup script in a Docker container with GPU support: 38 | 39 | ```sh 40 | docker run --rm \ 41 | --gpus all \ 42 | --env-file .env \ 43 | -v $(pwd)/data:/code/data \ 44 | --entrypoint "/bin/bash" \ 45 | pypi-scout \ 46 | -c "python /code/pypi_scout/scripts/setup.py" 47 | ``` 48 | 49 | ### Option 3: Using Docker without NVIDIA GPU and NVIDIA Container Toolkit 50 | 51 | If you do not have an NVIDIA GPU or the NVIDIA Container Toolkit installed, follow these steps: 52 | 53 | 1. Build the Docker image: 54 | 55 | ```sh 56 | docker build -f DockerfileCPU -t pypi-scout . 57 | ``` 58 | 59 | 2. Run the setup script in a Docker container without GPU support: 60 | 61 | ```sh 62 | docker run --rm \ 63 | --env-file .env \ 64 | -v $(pwd)/data:/code/data \ 65 | --entrypoint "/bin/bash" \ 66 | pypi-scout \ 67 | -c "python /code/pypi_scout/scripts/setup.py" 68 | ``` 69 | 70 | ### Running the Application 71 | 72 | After setting up the dataset, start the application using Docker Compose: 73 | 74 | ```sh 75 | docker-compose up 76 | ``` 77 | 78 | After a short while, your application will be live at [http://localhost:3000](http://localhost:3000). 
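Once the containers are running, you can also sanity-check the backend directly. The snippet below is a sketch that assumes the defaults from `docker-compose.yml` and the frontend configuration (the API is exposed on port 8000 under `/api`, and the frontend posts a JSON body with `query` and `top_k` to the `/search` endpoint):

```sh
# Query the same search endpoint that the frontend uses (assumes the default local setup)
curl -X POST http://localhost:8000/api/search \
  -H "Content-Type: application/json" \
  -d '{"query": "a package that creates plots and beautiful visualizations", "top_k": 10}'
```

If this returns a JSON object with a list of `matches` (each with a `name`, `similarity`, `weekly_downloads`, and `summary`), the dataset and vector embeddings were set up correctly.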
79 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | backend: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | ports: 9 | - "8000:8000" 10 | volumes: 11 | - ./data:/code/data 12 | env_file: 13 | - .env 14 | 15 | frontend: 16 | build: 17 | context: ./frontend 18 | dockerfile: Dockerfile 19 | args: 20 | NEXT_PUBLIC_API_URL: http://localhost:8000/api 21 | ports: 22 | - "3000:3000" 23 | depends_on: 24 | - backend 25 | -------------------------------------------------------------------------------- /frontend/.dockerignore: -------------------------------------------------------------------------------- 1 | # .dockerignore 2 | node_modules 3 | .next 4 | .env 5 | .git 6 | -------------------------------------------------------------------------------- /frontend/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | .yarn/install-state.gz 8 | 9 | # testing 10 | /coverage 11 | 12 | # next.js 13 | /.next/ 14 | /out/ 15 | 16 | # production 17 | /build 18 | 19 | # misc 20 | .DS_Store 21 | *.pem 22 | 23 | # debug 24 | npm-debug.log* 25 | yarn-debug.log* 26 | yarn-error.log* 27 | 28 | # local env files 29 | .env*.local 30 | 31 | # vercel 32 | .vercel 33 | 34 | # typescript 35 | *.tsbuildinfo 36 | next-env.d.ts 37 | 38 | .next 39 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Node.js image as the base image 2 | FROM node:18-alpine 3 | 4 | # Set the working directory inside the container 5 | WORKDIR /app 6 | 7 | # Copy package.json and package-lock.json files to the container 8 | COPY package.json package-lock.json ./ 9 | 10 | # Install dependencies 11 | RUN npm install 12 | 13 | # Copy the rest of the application code to the container 14 | COPY . . 15 | 16 | # Add build arguments to environment 17 | ARG NEXT_PUBLIC_API_URL 18 | ARG NEXT_PUBLIC_GA_TRACKING_ID 19 | ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL} 20 | ENV NEXT_PUBLIC_GA_TRACKING_ID=${NEXT_PUBLIC_GA_TRACKING_ID} 21 | 22 | # Build the Next.js application 23 | RUN npm run build 24 | 25 | # Expose the port on which the application will run 26 | EXPOSE 3000 27 | 28 | # Start the Next.js application 29 | CMD ["npm", "run", "start"] 30 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app). 
2 | 3 | ## Getting Started 4 | 5 | First, run the development server: 6 | 7 | ```bash 8 | npm run dev 9 | ``` 10 | 11 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. 12 | 13 | You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. 14 | 15 | This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font. 16 | 17 | ## Learn More 18 | 19 | To learn more about Next.js, take a look at the following resources: 20 | 21 | - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. 22 | - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. 23 | 24 | You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome! 25 | -------------------------------------------------------------------------------- /frontend/app/components/GitHubButton.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | 3 | const GitHubButton: React.FC = () => { 4 | return ( 5 | 11 | 18 | 19 | 20 | GitHub 21 | 22 | ); 23 | }; 24 | 25 | export default GitHubButton; 26 | -------------------------------------------------------------------------------- /frontend/app/components/GoogleAnalytics.tsx: -------------------------------------------------------------------------------- 1 | // app/components/GoogleAnalytics.tsx 2 | "use client"; 3 | 4 | import { useEffect } from "react"; 5 | 6 | const GoogleAnalytics = () => { 7 | useEffect(() => { 8 | const trackingId = process.env.NEXT_PUBLIC_GA_TRACKING_ID; 9 | if (trackingId) { 10 | const script1 = document.createElement("script"); 11 | script1.async = true; 12 | script1.src = `https://www.googletagmanager.com/gtag/js?id=${trackingId}`; 13 | document.head.appendChild(script1); 14 | 15 | const script2 = document.createElement("script"); 16 | script2.innerHTML = ` 17 | window.dataLayer = window.dataLayer || []; 18 | function gtag(){dataLayer.push(arguments);} 19 | gtag('js', new Date()); 20 | gtag('config', '${trackingId}'); 21 | `; 22 | document.head.appendChild(script2); 23 | } 24 | }, []); 25 | 26 | return null; 27 | }; 28 | 29 | export default GoogleAnalytics; 30 | -------------------------------------------------------------------------------- /frontend/app/components/Header.tsx: -------------------------------------------------------------------------------- 1 | import { useState } from "react"; 2 | import GitHubButton from "./GitHubButton"; 3 | import SupportButton from "./SupportButton"; 4 | import { FaBars, FaTimes } from "react-icons/fa"; 5 | 6 | const Header: React.FC = () => { 7 | const [isMenuOpen, setIsMenuOpen] = useState(false); 8 | 9 | const toggleMenu = () => { 10 | setIsMenuOpen(!isMenuOpen); 11 | }; 12 | 13 | return ( 14 |
15 |
16 | 17 | 18 |
19 |
20 | 26 |
27 | {isMenuOpen && ( 28 |
29 | 30 | 31 |
32 | )} 33 |
34 | ); 35 | }; 36 | 37 | export default Header; 38 | -------------------------------------------------------------------------------- /frontend/app/components/InfoBox.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | 3 | interface InfoBoxProps { 4 | infoBoxVisible: boolean; 5 | } 6 | 7 | const InfoBox: React.FC = ({ infoBoxVisible }) => { 8 | if (!infoBoxVisible) return null; 9 | 10 | return ( 11 |
12 |

13 | How does this work? 14 |

15 |

16 | This application allows you to search for Python packages on PyPI using 17 | natural language queries. For example, a query could be "a package 18 | that creates plots and beautiful visualizations". 19 |

20 |
21 |

22 | Once you click search, your query will be matched against the summary 23 | and the first part of the description of the ~100,000 most popular 24 | packages on PyPI, which includes all packages with at least ~100 25 | downloads per week. The results are then scored based on their 26 | similarity to the query and their number of weekly downloads, and the 27 | best results are displayed in the plot and table above.

29 |
30 | ); 31 | }; 32 | 33 | export default InfoBox; 34 | -------------------------------------------------------------------------------- /frontend/app/components/ScatterPlot.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { Scatter } from "react-chartjs-2"; 3 | import { 4 | Chart, 5 | Tooltip, 6 | Legend, 7 | PointElement, 8 | LinearScale, 9 | Title, 10 | LogarithmicScale, 11 | CategoryScale, 12 | FontSpec, 13 | } from "chart.js"; 14 | 15 | Chart.register( 16 | Tooltip, 17 | Legend, 18 | PointElement, 19 | LinearScale, 20 | Title, 21 | LogarithmicScale, 22 | CategoryScale, 23 | ); 24 | 25 | interface Match { 26 | name: string; 27 | similarity: number; 28 | weekly_downloads: number; 29 | summary: string; 30 | } 31 | 32 | interface ScatterPlotProps { 33 | results: Match[]; 34 | } 35 | 36 | const getColor = ( 37 | similarity: number, 38 | downloads: number, 39 | minSim: number, 40 | maxSim: number, 41 | minLogDownloads: number, 42 | maxLogDownloads: number, 43 | ) => { 44 | const baseColor = [54, 162, 235]; // Blue 45 | const highlightColor = [255, 99, 132]; // Red 46 | 47 | const normalizedSimilarity = (similarity - minSim) / (maxSim - minSim); 48 | const normalizedDownloads = 49 | (Math.log10(downloads) - minLogDownloads) / 50 | (maxLogDownloads - minLogDownloads); 51 | 52 | const weight = Math.min( 53 | ((normalizedSimilarity + normalizedDownloads) / 2) * 1.5, 54 | 1, 55 | ); 56 | 57 | const color = baseColor.map((base, index) => 58 | Math.round(base + weight * (highlightColor[index] - base)), 59 | ); 60 | 61 | return `rgba(${color.join(",")}, 0.8)`; 62 | }; 63 | 64 | const getPointSize = ( 65 | similarity: number, 66 | downloads: number, 67 | minSim: number, 68 | maxSim: number, 69 | minLogDownloads: number, 70 | maxLogDownloads: number, 71 | ) => { 72 | const normalizedSimilarity = (similarity - minSim) / (maxSim - minSim); 73 | const normalizedDownloads = 74 | (Math.log10(downloads) - minLogDownloads) / 75 | (maxLogDownloads - minLogDownloads); 76 | 77 | const minSize = 2; 78 | const size = Math.min( 79 | (normalizedSimilarity + normalizedDownloads) * 10 + minSize, 80 | 25, 81 | ); 82 | return size; 83 | }; 84 | 85 | const ScatterPlot: React.FC = ({ results }) => { 86 | const similarities = results.map((result) => result.similarity); 87 | const downloads = results.map((result) => result.weekly_downloads); 88 | const logDownloads = downloads.map((download) => Math.log10(download)); 89 | 90 | const minSim = Math.min(...similarities); 91 | const maxSim = Math.max(...similarities); 92 | const minLogDownloads = Math.min(...logDownloads); 93 | const maxLogDownloads = Math.max(...logDownloads); 94 | 95 | const data = { 96 | datasets: [ 97 | { 98 | label: "Packages", 99 | data: results.map((result) => ({ 100 | x: result.similarity, 101 | y: result.weekly_downloads, 102 | name: result.name, 103 | summary: result.summary, 104 | link: `https://pypi.org/project/${result.name}/`, 105 | })), 106 | backgroundColor: results.map((result) => 107 | getColor( 108 | result.similarity, 109 | result.weekly_downloads, 110 | minSim, 111 | maxSim, 112 | minLogDownloads, 113 | maxLogDownloads, 114 | ), 115 | ), 116 | borderColor: results.map((result) => 117 | getColor( 118 | result.similarity, 119 | result.weekly_downloads, 120 | minSim, 121 | maxSim, 122 | minLogDownloads, 123 | maxLogDownloads, 124 | ), 125 | ), 126 | pointRadius: results.map((result) => 127 | getPointSize( 128 | result.similarity, 129 | result.weekly_downloads, 130 
| minSim, 131 | maxSim, 132 | minLogDownloads, 133 | maxLogDownloads, 134 | ), 135 | ), 136 | hoverBackgroundColor: results.map((result) => 137 | getColor( 138 | result.similarity, 139 | result.weekly_downloads, 140 | minSim, 141 | maxSim, 142 | minLogDownloads, 143 | maxLogDownloads, 144 | ), 145 | ), 146 | hoverBorderColor: results.map((result) => 147 | getColor( 148 | result.similarity, 149 | result.weekly_downloads, 150 | minSim, 151 | maxSim, 152 | minLogDownloads, 153 | maxLogDownloads, 154 | ), 155 | ), 156 | pointHoverRadius: 15, 157 | }, 158 | ], 159 | }; 160 | 161 | const options = { 162 | responsive: true, 163 | maintainAspectRatio: false, 164 | plugins: { 165 | tooltip: { 166 | callbacks: { 167 | title: (context: any) => { 168 | const dataPoint = context[0].raw; 169 | return dataPoint.name; 170 | }, 171 | beforeLabel: (context: any) => { 172 | const dataPoint = context.raw; 173 | return dataPoint.summary; 174 | }, 175 | label: () => "", 176 | afterLabel: (context: any) => { 177 | const dataPoint = context.raw; 178 | return `\nWeekly downloads: ${dataPoint.y.toLocaleString()}`; 179 | }, 180 | }, 181 | titleFont: { size: 16, weight: "bold" as FontSpec["weight"] }, 182 | bodyFont: { size: 14, weight: "normal" as FontSpec["weight"] }, 183 | footerFont: { size: 12, weight: "normal" as FontSpec["weight"] }, 184 | displayColors: false, 185 | backgroundColor: "rgba(0, 0, 0, 0.8)", 186 | padding: 10, 187 | bodySpacing: 4, 188 | titleAlign: "left" as const, 189 | bodyAlign: "left" as const, 190 | footerAlign: "left" as const, 191 | }, 192 | legend: { 193 | display: false, 194 | }, 195 | }, 196 | scales: { 197 | x: { 198 | title: { 199 | display: true, 200 | text: "Similarity", 201 | color: "#FFFFFF", 202 | font: { 203 | size: 24, 204 | }, 205 | }, 206 | ticks: { 207 | color: "#FFFFFF", 208 | display: false, 209 | }, 210 | grid: { 211 | display: false, 212 | }, 213 | }, 214 | y: { 215 | title: { 216 | display: true, 217 | text: "Weekly Downloads", 218 | color: "#FFFFFF", 219 | font: { 220 | size: 24, 221 | }, 222 | }, 223 | ticks: { 224 | callback: function (value: any) { 225 | return value.toLocaleString(); 226 | }, 227 | color: "#FFFFFF", 228 | maxTicksLimit: 5, 229 | }, 230 | type: "logarithmic" as const, 231 | }, 232 | }, 233 | onClick: (event: any, elements: any) => { 234 | if (elements.length > 0) { 235 | const elementIndex = elements[0].index; 236 | const datasetIndex = elements[0].datasetIndex; 237 | const link = data.datasets[datasetIndex].data[elementIndex].link; 238 | window.open(link, "_blank"); 239 | } 240 | }, 241 | onHover: (event: any, elements: any) => { 242 | event.native.target.style.cursor = elements[0] ? "pointer" : "default"; 243 | }, 244 | elements: { 245 | point: { 246 | hoverRadius: 15, 247 | }, 248 | }, 249 | }; 250 | 251 | const plugins = [ 252 | { 253 | id: "customLabels", 254 | afterDatasetsDraw: (chart: any) => { 255 | const ctx = chart.ctx; 256 | chart.data.datasets.forEach((dataset: any) => { 257 | dataset.data.forEach((dataPoint: any, index: number) => { 258 | const { x, y } = chart 259 | .getDatasetMeta(0) 260 | .data[index].tooltipPosition(); 261 | ctx.fillStyle = "white"; 262 | ctx.textAlign = "center"; 263 | ctx.fillText(dataPoint.name, x, y - 10); 264 | }); 265 | }); 266 | }, 267 | }, 268 | ]; 269 | 270 | return ( 271 |
272 |

273 | Click a package to go to PyPI 274 |

275 |
276 |
277 | 278 |
279 |
280 | ); 281 | }; 282 | 283 | export default ScatterPlot; 284 | -------------------------------------------------------------------------------- /frontend/app/components/SearchResultsTable.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { FaExternalLinkAlt } from "react-icons/fa"; // Import the icon 3 | 4 | interface Match { 5 | name: string; 6 | similarity: number; 7 | weekly_downloads: number; 8 | summary: string; 9 | } 10 | 11 | interface SearchResultsTableProps { 12 | results: Match[]; 13 | sortField: string; 14 | sortDirection: string; 15 | onSort: (field: string) => void; 16 | } 17 | 18 | const SearchResultsTable: React.FC = ({ 19 | results, 20 | sortField, 21 | sortDirection, 22 | onSort, 23 | }) => { 24 | const getSortIndicator = (field: string) => { 25 | return sortField === field ? (sortDirection === "asc" ? "▲" : "▼") : ""; 26 | }; 27 | 28 | const truncateText = (text: string, maxLength: number) => { 29 | return text.length > maxLength 30 | ? `${text.substring(0, maxLength)}...` 31 | : text; 32 | }; 33 | 34 | return ( 35 |
36 | 37 | 38 | 39 | 47 | 56 | 67 | 70 | 73 | 74 | 75 | 76 | {results.map((result, index) => ( 77 | 78 | 81 | 84 | 87 | 90 | 101 | 102 | ))} 103 | 104 |
onSort("name")} 42 | > 43 |
44 | Name {getSortIndicator("name")} 45 |
46 |
onSort("similarity")} 50 | > 51 |
52 | Similarity{" "} 53 | {getSortIndicator("similarity")} 54 |
55 |
onSort("weekly_downloads")} 59 | > 60 |
61 | Weekly Downloads{" "} 62 | 63 | {getSortIndicator("weekly_downloads")} 64 | 65 |
66 |
68 | Summary 69 | 71 | Link 72 |
79 | {truncateText(result.name, 20)} 80 | 82 | {result.similarity.toFixed(3)} 83 | 85 | {result.weekly_downloads.toLocaleString()} 86 | 88 | {result.summary} 89 | 91 | 97 | 98 | PyPI 99 | 100 |
105 |
106 | ); 107 | }; 108 | 109 | export default SearchResultsTable; 110 | -------------------------------------------------------------------------------- /frontend/app/components/SupportButton.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | 3 | const SupportButton: React.FC = () => { 4 | return ( 5 | 11 | Ko-fi logo 18 | Support 19 | 20 | ); 21 | }; 22 | 23 | export default SupportButton; 24 | -------------------------------------------------------------------------------- /frontend/app/components/ToggleSwitch.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | 3 | interface ToggleSwitchProps { 4 | option1: string; 5 | option2: string; 6 | selectedOption: string; 7 | onToggle: (option: string) => void; 8 | } 9 | 10 | const ToggleSwitch: React.FC = ({ 11 | option1, 12 | option2, 13 | selectedOption, 14 | onToggle, 15 | }) => { 16 | return ( 17 |
18 | 28 | 38 |
39 | ); 40 | }; 41 | 42 | export default ToggleSwitch; 43 | -------------------------------------------------------------------------------- /frontend/app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgmaas/pypi-scout/593a48a2512a14c350bae98b087cd861d94a0c6b/frontend/app/favicon.ico -------------------------------------------------------------------------------- /frontend/app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | :root { 6 | --foreground-rgb: 0, 0, 0; 7 | --background-start-rgb: 214, 219, 220; 8 | --background-end-rgb: 255, 255, 255; 9 | --dark-bg-start-rgb: 8, 47, 73; /* Dark sky (bg-sky-950) */ 10 | --dark-bg-end-rgb: 8, 47, 73; /* Dark sky (bg-sky-950) */ 11 | --dark-foreground-rgb: 255, 255, 255; 12 | } 13 | 14 | @media (prefers-color-scheme: dark) { 15 | :root { 16 | --foreground-rgb: var(--dark-foreground-rgb); 17 | --background-start-rgb: var(--dark-bg-start-rgb); 18 | --background-end-rgb: var(--dark-bg-end-rgb); 19 | } 20 | } 21 | 22 | body { 23 | color: rgb(var(--foreground-rgb)); 24 | background: rgb(var(--background-start-rgb)); /* Solid background color */ 25 | } 26 | 27 | @layer utilities { 28 | .text-balance { 29 | text-wrap: balance; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /frontend/app/layout.tsx: -------------------------------------------------------------------------------- 1 | import type { Metadata } from "next"; 2 | import { Inter } from "next/font/google"; 3 | import "./globals.css"; 4 | import GoogleAnalytics from "./components/GoogleAnalytics"; 5 | 6 | const inter = Inter({ subsets: ["latin"] }); 7 | 8 | export const metadata: Metadata = { 9 | title: "PyPI Scout", 10 | description: "Find Python packages on PyPI with natural language queries", 11 | openGraph: { 12 | title: "PyPI Scout", 13 | description: "Find Python packages on PyPI with natural language queries", 14 | images: [ 15 | { 16 | url: "/pypi-light.svg", 17 | width: 600, 18 | height: 300, 19 | alt: "pypi-scout logo", 20 | }, 21 | ], 22 | }, 23 | }; 24 | 25 | export default function RootLayout({ 26 | children, 27 | }: Readonly<{ 28 | children: React.ReactNode; 29 | }>) { 30 | return ( 31 | 32 | 33 | 34 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | {children} 46 | 47 | 48 | ); 49 | } 50 | -------------------------------------------------------------------------------- /frontend/app/page.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { useState, useEffect, useRef } from "react"; 4 | import { handleSearch, sortResults } from "./utils/search"; 5 | import SearchResultsTable from "./components/SearchResultsTable"; 6 | import InfoBox from "./components/InfoBox"; 7 | import ScatterPlot from "./components/ScatterPlot"; 8 | import ToggleSwitch from "./components/ToggleSwitch"; 9 | import { ClipLoader } from "react-spinners"; 10 | import Header from "./components/Header"; 11 | 12 | interface Match { 13 | name: string; 14 | similarity: number; 15 | weekly_downloads: number; 16 | summary: string; 17 | } 18 | 19 | export default function Home() { 20 | const [text, setText] = useState(""); 21 | const [results, setResults] = useState([]); 22 | const [sortField, setSortField] = useState("similarity"); 23 | const [sortDirection, setSortDirection] = useState("desc"); 24 | const 
[loading, setLoading] = useState(false); 25 | const [error, setError] = useState(""); 26 | const [infoBoxVisible, setInfoBoxVisible] = useState(false); 27 | const [view, setView] = useState("Plot"); 28 | 29 | const resultsRef = useRef(null); 30 | 31 | // If the user is on a small screen, default to the table view instead of the plot. 32 | useEffect(() => { 33 | if (window.innerWidth < 768) { 34 | setView("Table"); 35 | } 36 | }, []); 37 | 38 | useEffect(() => { 39 | if (results.length > 0) { 40 | resultsRef.current?.scrollIntoView({ behavior: "smooth" }); 41 | } 42 | }, [results]); 43 | 44 | const handleSort = (field: string) => { 45 | const direction = 46 | sortField === field && sortDirection === "asc" ? "desc" : "asc"; 47 | setSortField(field); 48 | setSortDirection(direction); 49 | setResults(sortResults(results, field, direction)); 50 | }; 51 | 52 | const handleSearchAction = () => { 53 | handleSearch( 54 | text, 55 | sortField, 56 | sortDirection, 57 | setResults, 58 | setLoading, 59 | setError, 60 | ); 61 | }; 62 | 63 | const handleKeyDown = (e: React.KeyboardEvent) => { 64 | if (e.key === "Enter" && !e.shiftKey) { 65 | e.preventDefault(); 66 | handleSearchAction(); 67 | } 68 | }; 69 | 70 | return ( 71 |
72 |
73 |
74 |
75 | 76 | pypi-scout logo 83 | 84 |

85 | Find packages on PyPI with natural language queries 86 |

87 |
88 | 89 |
90 | 97 | 103 | {loading && ( 104 | 105 | )} 106 | {error &&

{error}

} 107 |
108 | 109 | {results.length > 0 && ( 110 |
111 | 117 |
118 | )} 119 | 120 |
121 | {" "} 122 | {/* Reference to this div */} 123 | {results.length > 0 && view === "Plot" && ( 124 |
125 |
126 | 127 |
128 |
129 | )} 130 | {results.length > 0 && view === "Table" && ( 131 |
132 |
133 | 139 |
140 |
141 | )} 142 |
143 | 144 |
145 | 151 |
152 | 153 | 154 |
155 |
156 | ); 157 | } 158 | -------------------------------------------------------------------------------- /frontend/app/utils/search.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | 3 | interface Match { 4 | name: string; 5 | similarity: number; 6 | weekly_downloads: number; 7 | summary: string; 8 | } 9 | 10 | interface SearchResponse { 11 | matches: Match[]; 12 | warning?: boolean; 13 | warning_message?: string; 14 | } 15 | 16 | const apiUrl = process.env.NEXT_PUBLIC_API_URL; 17 | 18 | export const handleSearch = async ( 19 | query: string, 20 | sortField: string, 21 | sortDirection: string, 22 | setResults: React.Dispatch>, 23 | setLoading: React.Dispatch>, 24 | setError: React.Dispatch>, 25 | ) => { 26 | setLoading(true); 27 | setError(""); 28 | try { 29 | const response = await axios.post( 30 | `${apiUrl}/search`, 31 | { 32 | query: query, 33 | top_k: 40, 34 | }, 35 | { 36 | headers: { 37 | "Content-Type": "application/json", 38 | }, 39 | }, 40 | ); 41 | 42 | const { matches, warning, warning_message } = response.data; 43 | 44 | if (warning && warning_message) { 45 | console.warn("Warning from API:", warning_message); 46 | } 47 | 48 | setResults(sortResults(matches, sortField, sortDirection)); 49 | } catch (error) { 50 | if (axios.isAxiosError(error) && error.response?.status === 429) { 51 | setError("Rate limit reached. Please wait a minute and try again."); 52 | } else { 53 | setError("Error fetching search results."); 54 | } 55 | console.error("Error fetching search results:", error); 56 | } finally { 57 | setLoading(false); 58 | } 59 | }; 60 | 61 | export const sortResults = ( 62 | data: Match[], 63 | field: string, 64 | direction: string, 65 | ): Match[] => { 66 | return [...data].sort((a, b) => { 67 | // @ts-ignore 68 | if (a[field] < b[field]) return direction === "asc" ? -1 : 1; 69 | // @ts-ignore 70 | if (a[field] > b[field]) return direction === "asc" ? 
1 : -1; 71 | return 0; 72 | }); 73 | }; 74 | -------------------------------------------------------------------------------- /frontend/next.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = { 3 | env: { 4 | NEXT_PUBLIC_API_URL: 5 | process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000/api", 6 | }, 7 | }; 8 | 9 | export default nextConfig; 10 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "axios": "^1.7.2", 13 | "chart.js": "^4.4.3", 14 | "next": "14.2.4", 15 | "react": "^18", 16 | "react-chartjs-2": "^5.2.0", 17 | "react-dom": "^18", 18 | "react-icons": "^5.2.1", 19 | "react-spinners": "^0.13.8" 20 | }, 21 | "devDependencies": { 22 | "@types/node": "^20", 23 | "@types/react": "^18", 24 | "@types/react-dom": "^18", 25 | "eslint": "^8", 26 | "eslint-config-next": "14.2.4", 27 | "postcss": "^8", 28 | "tailwindcss": "^3.4.1", 29 | "typescript": "^5" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /frontend/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /frontend/public/kofi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgmaas/pypi-scout/593a48a2512a14c350bae98b087cd861d94a0c6b/frontend/public/kofi.png -------------------------------------------------------------------------------- /frontend/public/next.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /frontend/public/pypi-light.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /frontend/public/pypi.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /frontend/public/vercel.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /frontend/tailwind.config.ts: -------------------------------------------------------------------------------- 1 | import type { Config } from "tailwindcss"; 2 | 3 | const config: Config = { 4 | content: [ 5 | "./pages/**/*.{js,ts,jsx,tsx,mdx}", 6 | "./components/**/*.{js,ts,jsx,tsx,mdx}", 7 | "./app/**/*.{js,ts,jsx,tsx,mdx}", 8 | ], 9 | theme: { 10 | extend: { 11 | colors: { 12 | sky: { 13 | 50: "#d9f0ff", // Darkened from #f0f9ff 14 | 100: "#c3e4fe", // Darkened from #e0f2fe 15 | 200: "#a3d4fd", // Darkened from #bae6fd 16 | 300: "#5cbdfc", // Darkened from #7dd3fc 17 | 400: "#2aa3f8", // Darkened from #38bdf8 18 | 500: "#0b8edc", // Darkened 
from #0ea5e9 19 | 600: "#026baa", // Darkened from #0284c7 20 | 700: "#015a89", // Darkened from #0369a1 21 | 800: "#054b6e", // Darkened from #075985 22 | 900: "#083857", // Darkened from #0c4a6e 23 | 950: "#062338", // Darkened from #082f49 24 | }, 25 | orange: { 26 | 100: "#f8d5c7", 27 | 200: "#f1ac9a", 28 | 300: "#ea836d", 29 | 400: "#e35a40", 30 | 500: "#d77a61", 31 | 600: "#c45b3f", 32 | 700: "#b23a1b", 33 | 800: "#D18829", // Orange from logo 34 | }, 35 | }, 36 | backgroundImage: { 37 | "gradient-radial": "radial-gradient(var(--tw-gradient-stops))", 38 | "gradient-conic": 39 | "conic-gradient(from 180deg at 50% 50%, var(--tw-gradient-stops))", 40 | }, 41 | }, 42 | }, 43 | plugins: [], 44 | }; 45 | 46 | export default config; 47 | -------------------------------------------------------------------------------- /frontend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "lib": ["dom", "dom.iterable", "esnext"], 4 | "allowJs": true, 5 | "skipLibCheck": true, 6 | "strict": true, 7 | "noEmit": true, 8 | "esModuleInterop": true, 9 | "module": "esnext", 10 | "moduleResolution": "bundler", 11 | "resolveJsonModule": true, 12 | "isolatedModules": true, 13 | "jsx": "preserve", 14 | "incremental": true, 15 | "plugins": [ 16 | { 17 | "name": "next" 18 | } 19 | ], 20 | "paths": { 21 | "@/*": ["./*"] 22 | } 23 | }, 24 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 25 | "exclude": ["node_modules"] 26 | } 27 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pypi-llm", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "dependencies": { 8 | "chart.js": "^4.4.3", 9 | "react-chartjs-2": "^5.2.0" 10 | } 11 | }, 12 | "node_modules/@kurkle/color": { 13 | "version": "0.3.2", 14 | "resolved": "https://registry.npmjs.org/@kurkle/color/-/color-0.3.2.tgz", 15 | "integrity": "sha512-fuscdXJ9G1qb7W8VdHi+IwRqij3lBkosAm4ydQtEmbY58OzHXqQhvlxqEkoz0yssNVn38bcpRWgA9PP+OGoisw==" 16 | }, 17 | "node_modules/chart.js": { 18 | "version": "4.4.3", 19 | "resolved": "https://registry.npmjs.org/chart.js/-/chart.js-4.4.3.tgz", 20 | "integrity": "sha512-qK1gkGSRYcJzqrrzdR6a+I0vQ4/R+SoODXyAjscQ/4mzuNzySaMCd+hyVxitSY1+L2fjPD1Gbn+ibNqRmwQeLw==", 21 | "dependencies": { 22 | "@kurkle/color": "^0.3.0" 23 | }, 24 | "engines": { 25 | "pnpm": ">=8" 26 | } 27 | }, 28 | "node_modules/js-tokens": { 29 | "version": "4.0.0", 30 | "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", 31 | "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", 32 | "peer": true 33 | }, 34 | "node_modules/loose-envify": { 35 | "version": "1.4.0", 36 | "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", 37 | "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", 38 | "peer": true, 39 | "dependencies": { 40 | "js-tokens": "^3.0.0 || ^4.0.0" 41 | }, 42 | "bin": { 43 | "loose-envify": "cli.js" 44 | } 45 | }, 46 | "node_modules/react": { 47 | "version": "18.3.1", 48 | "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", 49 | "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", 50 | "peer": true, 51 | "dependencies": { 52 | "loose-envify": "^1.1.0" 
53 | }, 54 | "engines": { 55 | "node": ">=0.10.0" 56 | } 57 | }, 58 | "node_modules/react-chartjs-2": { 59 | "version": "5.2.0", 60 | "resolved": "https://registry.npmjs.org/react-chartjs-2/-/react-chartjs-2-5.2.0.tgz", 61 | "integrity": "sha512-98iN5aguJyVSxp5U3CblRLH67J8gkfyGNbiK3c+l1QI/G4irHMPQw44aEPmjVag+YKTyQ260NcF82GTQ3bdscA==", 62 | "peerDependencies": { 63 | "chart.js": "^4.1.1", 64 | "react": "^16.8.0 || ^17.0.0 || ^18.0.0" 65 | } 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "chart.js": "^4.4.3", 4 | "react-chartjs-2": "^5.2.0" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /pypi_bigquery.sql: -------------------------------------------------------------------------------- 1 | WITH recent_downloads AS ( 2 | SELECT 3 | LOWER(project) AS project_lower, 4 | project, 5 | COUNT(*) AS download_count 6 | FROM 7 | `bigquery-public-data.pypi.file_downloads` 8 | WHERE 9 | DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) AND CURRENT_DATE() 10 | GROUP BY 11 | LOWER(project), project 12 | HAVING 13 | COUNT(*) >= 100 14 | ), 15 | latest_metadata AS ( 16 | SELECT 17 | LOWER(name) AS name_lower, 18 | name, 19 | description, 20 | summary, 21 | version, 22 | upload_time, 23 | ROW_NUMBER() OVER (PARTITION BY LOWER(name) ORDER BY upload_time DESC) AS rn 24 | FROM 25 | `bigquery-public-data.pypi.distribution_metadata` 26 | ) 27 | SELECT 28 | lm.name AS name, 29 | lm.description AS description, 30 | lm.summary AS summary, 31 | lm.version AS latest_version, 32 | rd.download_count AS number_of_downloads 33 | FROM 34 | recent_downloads rd 35 | JOIN 36 | latest_metadata lm 37 | ON 38 | rd.project_lower = lm.name_lower 39 | WHERE 40 | lm.rn = 1 41 | ORDER BY 42 | rd.download_count DESC; 43 | -------------------------------------------------------------------------------- /pypi_scout/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgmaas/pypi-scout/593a48a2512a14c350bae98b087cd861d94a0c6b/pypi_scout/__init__.py -------------------------------------------------------------------------------- /pypi_scout/api/data_loader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Tuple 3 | 4 | import polars as pl 5 | 6 | from pypi_scout.config import Config, StorageBackend 7 | from pypi_scout.utils.blob_io import BlobIO 8 | 9 | 10 | class ApiDataLoader: 11 | def __init__(self, config: Config): 12 | self.config = config 13 | 14 | def load_dataset(self) -> Tuple[pl.DataFrame, pl.DataFrame]: 15 | if self.config.STORAGE_BACKEND == StorageBackend.LOCAL: 16 | df_packages, df_embeddings = self._load_local_dataset() 17 | elif self.config.STORAGE_BACKEND == StorageBackend.BLOB: 18 | df_packages, df_embeddings = self._load_blob_dataset() 19 | else: 20 | raise ValueError(f"Unexpected value found for STORAGE_BACKEND: {self.config.STORAGE_BACKEND}") # noqa: TRY003 21 | 22 | df_embeddings = self._drop_rows_from_embeddings_that_do_not_appear_in_packages(df_embeddings, df_packages) 23 | return df_packages, df_embeddings 24 | 25 | def _load_local_dataset(self) -> Tuple[pl.DataFrame, pl.DataFrame]: 26 | packages_dataset_path = self.config.DATA_DIR / self.config.DATASET_FOR_API_CSV_NAME 27 | embeddings_dataset_path = 
self.config.DATA_DIR / self.config.EMBEDDINGS_PARQUET_NAME 28 | 29 | logging.info(f"Reading packages dataset from `{packages_dataset_path}`...") 30 | df_packages = pl.read_csv(packages_dataset_path) 31 | self._log_packages_dataset_info(df_packages) 32 | 33 | logging.info(f"Reading embeddings from `{embeddings_dataset_path}`...") 34 | df_embeddings = pl.read_parquet(embeddings_dataset_path) 35 | self._log_embeddings_dataset_info(df_embeddings) 36 | 37 | return df_packages, df_embeddings 38 | 39 | def _load_blob_dataset(self) -> Tuple[pl.DataFrame, pl.DataFrame]: 40 | blob_io = BlobIO( 41 | self.config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, 42 | self.config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, 43 | self.config.STORAGE_BACKEND_BLOB_KEY, 44 | ) 45 | 46 | logging.info( 47 | f"Downloading `{self.config.DATASET_FOR_API_CSV_NAME}` from container `{self.config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..." 48 | ) 49 | df_packages = blob_io.download_csv_to_df(self.config.DATASET_FOR_API_CSV_NAME) 50 | self._log_packages_dataset_info(df_packages) 51 | 52 | logging.info( 53 | f"Downloading `{self.config.EMBEDDINGS_PARQUET_NAME}` from container `{self.config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..." 54 | ) 55 | df_embeddings = blob_io.download_parquet_to_df(self.config.EMBEDDINGS_PARQUET_NAME) 56 | self._log_embeddings_dataset_info(df_embeddings) 57 | 58 | return df_packages, df_embeddings 59 | 60 | @staticmethod 61 | def _log_packages_dataset_info(df_packages: pl.DataFrame) -> None: 62 | logging.info(f"Finished loading the `packages` dataset. Number of rows in dataset: {len(df_packages):,}") 63 | logging.info(df_packages.describe()) 64 | 65 | @staticmethod 66 | def _log_embeddings_dataset_info(df_embeddings: pl.DataFrame) -> None: 67 | logging.info(f"Finished loading the `embeddings` dataset. Number of rows in dataset: {len(df_embeddings):,}") 68 | logging.info(df_embeddings.describe()) 69 | 70 | @staticmethod 71 | def _drop_rows_from_embeddings_that_do_not_appear_in_packages(df_embeddings, df_packages): 72 | # We only keep the packages in the vector dataset that also occur in the packages dataset. 73 | # In theory, this should never drop something. But still good to keep as a fail-safe to prevent issues in the API. 
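# (For clarity: a Polars `semi` join keeps only the rows of the left frame, here df_embeddings, that have a matching `name` in df_packages; no columns from df_packages are added.)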
74 | logging.info("Dropping packages in the `embeddings` dataset that do not occur in the `packages` dataset...") 75 | logging.info(f"Number of rows before dropping: {len(df_embeddings):,}...") 76 | df_embeddings = df_embeddings.join(df_packages, on="name", how="semi") 77 | logging.info(f"Number of rows after dropping: {len(df_embeddings):,}...") 78 | return df_embeddings 79 | -------------------------------------------------------------------------------- /pypi_scout/api/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from dotenv import load_dotenv 4 | from fastapi import FastAPI, HTTPException 5 | from fastapi.middleware.cors import CORSMiddleware 6 | from sentence_transformers import SentenceTransformer 7 | from slowapi import Limiter, _rate_limit_exceeded_handler 8 | from slowapi.errors import RateLimitExceeded 9 | from slowapi.util import get_remote_address 10 | from starlette.requests import Request 11 | 12 | from pypi_scout.api.data_loader import ApiDataLoader 13 | from pypi_scout.api.models import QueryModel, SearchResponse 14 | from pypi_scout.config import Config 15 | from pypi_scout.embeddings.simple_vector_database import SimpleVectorDatabase 16 | from pypi_scout.utils.logging import setup_logging 17 | from pypi_scout.utils.score_calculator import calculate_score 18 | 19 | setup_logging() 20 | logging.info("Initializing backend...") 21 | 22 | limiter = Limiter(key_func=get_remote_address) 23 | app = FastAPI() 24 | app.state.limiter = limiter 25 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) 26 | 27 | load_dotenv() 28 | config = Config() 29 | 30 | app.add_middleware( 31 | CORSMiddleware, 32 | allow_origins=["*"], 33 | allow_credentials=True, 34 | allow_methods=["*"], 35 | allow_headers=["*"], 36 | ) 37 | 38 | data_loader = ApiDataLoader(config) 39 | df_packages, df_embeddings = data_loader.load_dataset() 40 | 41 | model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME) 42 | vector_database = SimpleVectorDatabase(embeddings_model=model, df_embeddings=df_embeddings) 43 | 44 | 45 | @app.post("/api/search", response_model=SearchResponse) 46 | @limiter.limit("6/minute") 47 | async def search(query: QueryModel, request: Request): 48 | """ 49 | Search for the packages whose summary and description have the highest similarity to the query. 50 | We take the top_k * 2 most similar packages, and then calculate weighted score based on the similarity and weekly downloads. 51 | The top_k packages with the highest score are returned. 52 | """ 53 | 54 | if query.top_k > 100: 55 | raise HTTPException(status_code=400, detail="top_k cannot be larger than 100.") 56 | 57 | logging.info(f"Searching for similar projects. Query: '{query.query}'") 58 | df_matches = vector_database.find_similar(query.query, top_k=int(query.top_k * 3)) 59 | df_matches = df_matches.join(df_packages, how="left", on="name") 60 | logging.info( 61 | f"Fetched the {len(df_matches)} most similar projects. Calculating the weighted scores and filtering..." 
62 | ) 63 | 64 | df_matches = calculate_score( 65 | df_matches, weight_similarity=config.WEIGHT_SIMILARITY, weight_weekly_downloads=config.WEIGHT_WEEKLY_DOWNLOADS 66 | ) 67 | df_matches = df_matches.sort("score", descending=True) 68 | 69 | if len(df_matches) > query.top_k: 70 | df_matches = df_matches.head(query.top_k) 71 | 72 | logging.info(f"Returning the {len(df_matches)} best matches.") 73 | df_matches = df_matches.select(["name", "similarity", "summary", "weekly_downloads"]) 74 | return SearchResponse(matches=df_matches.to_dicts()) 75 | -------------------------------------------------------------------------------- /pypi_scout/api/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class QueryModel(BaseModel): 5 | query: str 6 | top_k: int 7 | 8 | 9 | class Match(BaseModel): 10 | name: str 11 | summary: str 12 | similarity: float 13 | weekly_downloads: int 14 | 15 | 16 | class SearchResponse(BaseModel): 17 | matches: list[Match] 18 | warning: bool = False 19 | warning_message: str = None 20 | -------------------------------------------------------------------------------- /pypi_scout/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from pathlib import Path 5 | 6 | 7 | class StorageBackend(Enum): 8 | LOCAL = "LOCAL" 9 | BLOB = "BLOB" 10 | 11 | 12 | @dataclass 13 | class Config: 14 | # Name of the model used for generating vector embeddings from text. 15 | # See https://sbert.net/docs/sentence_transformer/pretrained_models.html for available models. 16 | EMBEDDINGS_MODEL_NAME = "all-mpnet-base-v2" 17 | 18 | # Boolean to overwrite raw data file if it already exists 19 | OVERWRITE: bool = True 20 | 21 | # Directory where dataset files are stored. 22 | DATA_DIR: Path = Path("data") 23 | 24 | # Filename for the raw dataset CSV. 25 | RAW_DATASET_CSV_NAME = "raw_dataset.csv" 26 | 27 | # Filename for the processed dataset CSV. 28 | PROCESSED_DATASET_CSV_NAME = "processed_dataset.csv" 29 | 30 | # Filename for the dataset that contains the minimal data that the API needs. 31 | # For example; it needs the name, weekly downloads, and the summary, but not the (cleaned) description. 32 | DATASET_FOR_API_CSV_NAME = "dataset_for_api.csv" 33 | 34 | # Filename for the dataset that contains the minimal data that the API needs. 35 | # For example; it needs the name, weekly downloads, and the summary, but not the (cleaned) description. 36 | EMBEDDINGS_PARQUET_NAME = "embeddings.parquet" 37 | 38 | # Google Drive file ID for downloading the raw dataset. 39 | GOOGLE_FILE_ID = "12AH8PwKvZqRhXBf9uS1qRZq1-k3gIhhG" 40 | 41 | # Fraction of the dataset to include in the vector database. This value determines the portion of top packages 42 | # (sorted by weekly downloads) to include. Increase this value to include a larger portion of the dataset, up to 1.0 (100%). 43 | # For reference, a value of 0.25 corresponds to including all PyPI packages with at least approximately 650 weekly downloads 44 | FRAC_DATA_TO_INCLUDE = 1 45 | 46 | # Weights for the combined score calculation. Higher WEIGHT_SIMILARITY prioritizes 47 | # relevance based on text similarity, while higher WEIGHT_WEEKLY_DOWNLOADS prioritizes 48 | # packages with more weekly downloads. 49 | WEIGHT_SIMILARITY = 0.5 50 | WEIGHT_WEEKLY_DOWNLOADS = 0.5 51 | 52 | # Storage backend configuration. 
Can be either StorageBackend.LOCAL or StorageBackend.BLOB. 53 | # If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API 54 | # will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB, 55 | # the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables. 56 | STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL 57 | STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None 58 | STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None 59 | STORAGE_BACKEND_BLOB_KEY: str | None = None 60 | 61 | def __post_init__(self) -> None: 62 | if os.getenv("STORAGE_BACKEND") == "BLOB": 63 | self.STORAGE_BACKEND = StorageBackend.BLOB 64 | self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME = os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME") 65 | self.STORAGE_BACKEND_BLOB_CONTAINER_NAME = os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME") 66 | self.STORAGE_BACKEND_BLOB_KEY = os.getenv("STORAGE_BACKEND_BLOB_KEY") 67 | 68 | if not all( 69 | [ 70 | self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, 71 | self.STORAGE_BACKEND_BLOB_CONTAINER_NAME, 72 | self.STORAGE_BACKEND_BLOB_KEY, 73 | ] 74 | ): 75 | raise OSError("One or more BLOB storage environment variables are missing!") # noqa: TRY003 76 | -------------------------------------------------------------------------------- /pypi_scout/data/description_cleaner.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | 4 | import polars as pl 5 | from bs4 import BeautifulSoup 6 | 7 | CLEANING_FAILED = "cleaning failed!" 8 | 9 | 10 | @dataclass 11 | class DescriptionCleaner: 12 | """ 13 | A class that provides methods to clean PyPI package descriptions in a DataFrame column. 14 | """ 15 | 16 | def clean(self, df: pl.DataFrame, input_col: str, output_col: str) -> pl.DataFrame: 17 | """ 18 | Cleans the text in the specified DataFrame column and returns the modified DataFrame. 19 | 20 | Args: 21 | df (pl.DataFrame): The DataFrame containing the text column to be cleaned. 22 | input_col (str): The name of the input column containing the text to be cleaned. 23 | output_col (str): The name of the output column to store the cleaned text. 24 | 25 | Returns: 26 | pl.DataFrame: The modified DataFrame with the cleaned text. 27 | """ 28 | df = df.with_columns(pl.col(input_col).map_elements(self._clean_text, return_dtype=pl.String).alias(output_col)) 29 | return df 30 | 31 | def _clean_text(self, text: str) -> str: 32 | """ 33 | Cleans the given text by removing HTML tags, markdown image links, markdown badges, 34 | markdown links, URLs, special markdown characters, markdown headers, and extra whitespaces. 35 | 36 | Args: 37 | text (str): The text to be cleaned. 38 | 39 | Returns: 40 | str: The cleaned text. 
41 | """ 42 | try: 43 | text = self._remove_html_tags(text) 44 | text = self._remove_markdown_image_links(text) 45 | text = self._remove_markdown_badges(text) 46 | text = self._remove_markdown_links(text) 47 | text = self._remove_urls(text) 48 | text = self._remove_special_markdown_characters(text) 49 | text = self._remove_markdown_headers(text) 50 | text = self._remove_extra_whitespaces(text) 51 | except: # noqa: E722 52 | return CLEANING_FAILED 53 | 54 | return text 55 | 56 | @staticmethod 57 | def _remove_html_tags(text: str) -> str: 58 | soup = BeautifulSoup(text, "lxml") 59 | return soup.get_text(separator=" ") 60 | 61 | @staticmethod 62 | def _remove_markdown_image_links(text: str) -> str: 63 | return re.sub(r"!\[.*?\]\(.*?\)", "", text) 64 | 65 | @staticmethod 66 | def _remove_markdown_badges(text: str) -> str: 67 | return re.sub(r"\[!\[.*?\]\(.*?\)\]", "", text) 68 | 69 | @staticmethod 70 | def _remove_markdown_links(text: str) -> str: 71 | return re.sub(r"\[.*?\]\(.*?\)", "", text) 72 | 73 | @staticmethod 74 | def _remove_urls(text: str) -> str: 75 | return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE) 76 | 77 | @staticmethod 78 | def _remove_special_markdown_characters(text: str) -> str: 79 | return re.sub(r"[#*=_`]", "", text) 80 | 81 | @staticmethod 82 | def _remove_markdown_headers(text: str) -> str: 83 | return re.sub(r"\n\s*#{1,6}\s*", " ", text) 84 | 85 | @staticmethod 86 | def _remove_extra_whitespaces(text: str) -> str: 87 | return " ".join(text.split()) 88 | -------------------------------------------------------------------------------- /pypi_scout/data/raw_data_reader.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | 4 | import polars as pl 5 | 6 | 7 | @dataclass 8 | class RawDataReader: 9 | """ 10 | A class for reading and processing data from a raw PyPI dataset. 11 | """ 12 | 13 | raw_dataset: Path 14 | 15 | def read(self): 16 | """ 17 | Reads the raw dataset, performs data processing operations, and returns the processed dataframe. 18 | The dataset should at least have the following columns: name, description, and number_of_downloads. 19 | 20 | Returns: 21 | DataFrame: The processed dataframe. 22 | """ 23 | df = pl.read_csv(self.raw_dataset) 24 | df = df.with_columns(weekly_downloads=pl.col("number_of_downloads").cast(pl.Int32)) 25 | df = df.drop("number_of_downloads") 26 | df = df.unique(subset="name") 27 | df = df.filter(~(pl.col("description").is_null() & pl.col("summary").is_null())) 28 | df = df.sort("weekly_downloads", descending=True) 29 | df = df.with_columns( 30 | summary=pl.col("summary").fill_null(""), 31 | description=pl.col("description").fill_null(""), 32 | ) 33 | return df 34 | -------------------------------------------------------------------------------- /pypi_scout/embeddings/embeddings_creator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import polars as pl 4 | from sentence_transformers import SentenceTransformer 5 | from tqdm import tqdm 6 | 7 | 8 | class VectorEmbeddingCreator: 9 | def __init__( 10 | self, 11 | embeddings_model: SentenceTransformer, 12 | embedding_column_name: str = "embeddings", 13 | batch_size: int = 128, 14 | ): 15 | """ 16 | Initializes the VectorEmbeddingCreator with a SentenceTransformer model, embedding column name, and batch size. 
17 | 18 | Args: 19 | embeddings_model (SentenceTransformer): The SentenceTransformer model to generate embeddings. 20 | embedding_column_name (str, optional): The name of the column to store embeddings. Defaults to 'embeddings'. 21 | batch_size (int, optional): The size of batches to process at a time. Defaults to 128. 22 | """ 23 | self.model = embeddings_model 24 | self.embedding_column_name = embedding_column_name 25 | self.batch_size = batch_size 26 | 27 | def add_embeddings(self, df: pl.DataFrame, text_column: str) -> pl.DataFrame: 28 | """ 29 | Adds embeddings to the DataFrame based on the specified text column. 30 | 31 | Args: 32 | df (pl.DataFrame): The Polars DataFrame to which embeddings will be added. 33 | text_column (str): The column name containing text to generate embeddings for. 34 | 35 | Returns: 36 | pl.DataFrame: The DataFrame with an additional column containing embeddings. 37 | """ 38 | logging.info("Splitting DataFrame into batches...") 39 | df_chunks = self._split_dataframe_in_batches(df, batch_size=self.batch_size) 40 | all_embeddings = [] 41 | 42 | logging.info("Generating embeddings...") 43 | for chunk in tqdm(df_chunks, desc="Generating embeddings", unit="batch"): 44 | embeddings = self._generate_embeddings(chunk, text_column) 45 | all_embeddings.extend(embeddings) 46 | 47 | df = df.with_columns(pl.Series(self.embedding_column_name, all_embeddings)) 48 | return df 49 | 50 | def _generate_embeddings(self, chunk: pl.DataFrame, text_column: str) -> list: 51 | embeddings = self.model.encode(list(chunk[text_column]), show_progress_bar=False) 52 | return embeddings 53 | 54 | @staticmethod 55 | def _split_dataframe_in_batches(df: pl.DataFrame, batch_size: int) -> list: 56 | """ 57 | Splits a Polars DataFrame into batches. 58 | """ 59 | n_chunks = (df.height + batch_size - 1) // batch_size 60 | return [df.slice(i * batch_size, batch_size) for i in range(n_chunks)] 61 | -------------------------------------------------------------------------------- /pypi_scout/embeddings/simple_vector_database.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import polars as pl 3 | from sentence_transformers import SentenceTransformer 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | 6 | 7 | class SimpleVectorDatabase: 8 | def __init__( 9 | self, 10 | embeddings_model: SentenceTransformer, 11 | df_embeddings: pl.DataFrame, 12 | embedding_column: str = "embeddings", 13 | processed_column: str = "embeddings_array", 14 | ): 15 | """ 16 | Initializes the SimpleVectorDatabase with a SentenceTransformer model and a DataFrame containing embeddings. 17 | 18 | Args: 19 | embeddings_model (SentenceTransformer): The SentenceTransformer model to generate embeddings. 20 | df_embeddings (pl.DataFrame): The Polars DataFrame containing the initial embeddings. 21 | embedding_column (str, optional): The name of the column containing the original embeddings. Defaults to 'embeddings'. 22 | """ 23 | self.embeddings_model = embeddings_model 24 | self.df_embeddings = df_embeddings 25 | self.embedding_column = embedding_column 26 | self.embeddings_matrix = self._create_embeddings_matrix() 27 | 28 | def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame: 29 | """ 30 | Finds the top_k most similar vectors in the database for a given query. 31 | 32 | Args: 33 | query (str): The query string to find similar vectors for. 34 | top_k (int, optional): The number of similar vectors to retrieve. Defaults to 25. 
35 | 36 | Returns: 37 | pl.DataFrame: A Polars DataFrame containing the most similar vectors and their similarity scores. 38 | """ 39 | query_embedding = self.embeddings_model.encode(query, show_progress_bar=False) 40 | 41 | similarities = cosine_similarity([query_embedding], self.embeddings_matrix)[0] 42 | 43 | top_k_indices = np.argsort(similarities)[::-1][:top_k] 44 | top_k_scores = similarities[top_k_indices] 45 | df_best_matches = self.df_embeddings[top_k_indices] 46 | 47 | df_best_matches = df_best_matches.with_columns(pl.Series("similarity", top_k_scores)) 48 | df_best_matches = df_best_matches.drop(self.embedding_column) 49 | 50 | return df_best_matches 51 | 52 | def _create_embeddings_matrix(self) -> np.ndarray: 53 | return np.stack( 54 | self.df_embeddings[self.embedding_column].apply(lambda x: np.array(x, dtype=np.float32)).to_numpy() 55 | ) 56 | -------------------------------------------------------------------------------- /pypi_scout/scripts/create_vector_embeddings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import polars as pl 5 | from dotenv import load_dotenv 6 | from sentence_transformers import SentenceTransformer 7 | 8 | from pypi_scout.config import Config 9 | from pypi_scout.embeddings.embeddings_creator import VectorEmbeddingCreator 10 | from pypi_scout.utils.logging import setup_logging 11 | 12 | 13 | def read_processed_dataset(path_to_processed_dataset: Path): 14 | logging.info("📂 Reading the processed dataset...") 15 | df = pl.read_csv(path_to_processed_dataset) 16 | logging.info(f"📊 Number of rows in the processed dataset: {len(df):,}") 17 | return df 18 | 19 | 20 | def write_parquet(df: pl.DataFrame, processed_dataset_path: Path): 21 | logging.info(f"Storing dataset in {processed_dataset_path}...") 22 | df.write_parquet(processed_dataset_path) 23 | logging.info("✅ Done!") 24 | 25 | 26 | def create_vector_embeddings(): 27 | setup_logging() 28 | load_dotenv() 29 | 30 | config = Config() 31 | df = read_processed_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) 32 | df = df.with_columns( 33 | summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned")) 34 | ) 35 | df = VectorEmbeddingCreator(embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)).add_embeddings( 36 | df, text_column="summary_and_description_cleaned" 37 | ) 38 | 39 | df = df.select("name", "embeddings").unique(subset="name") 40 | write_parquet(df, config.DATA_DIR / config.EMBEDDINGS_PARQUET_NAME) 41 | 42 | 43 | if __name__ == "__main__": 44 | setup_logging() 45 | create_vector_embeddings() 46 | -------------------------------------------------------------------------------- /pypi_scout/scripts/download_raw_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import gdown 4 | from dotenv import load_dotenv 5 | 6 | from pypi_scout.config import Config 7 | from pypi_scout.utils.logging import setup_logging 8 | 9 | 10 | def download_raw_dataset(): 11 | """ 12 | Downloads the dataset from a Google Drive link using the gdown library. 13 | """ 14 | load_dotenv() 15 | config = Config() 16 | 17 | target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME 18 | if target_path.exists(): 19 | if not config.OVERWRITE: 20 | logging.info(f"🔹 Raw dataset {target_path} from Google Drive already exists! 
Skipping download.") 21 | return 22 | else: 23 | logging.info( 24 | f"⤵️ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..." 25 | ) 26 | 27 | logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...") 28 | url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}" 29 | gdown.download(url, str(target_path), quiet=False) 30 | logging.info("✅ Done!") 31 | 32 | 33 | if __name__ == "__main__": 34 | setup_logging() 35 | download_raw_dataset() 36 | -------------------------------------------------------------------------------- /pypi_scout/scripts/process_raw_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import polars as pl 4 | from dotenv import load_dotenv 5 | 6 | from pypi_scout.config import Config 7 | from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner 8 | from pypi_scout.data.raw_data_reader import RawDataReader 9 | from pypi_scout.utils.logging import setup_logging 10 | 11 | 12 | def read_raw_dataset(path_to_raw_dataset): 13 | logging.info("📂 Reading the raw dataset...") 14 | df = RawDataReader(path_to_raw_dataset).read() 15 | logging.info(f"📊 Number of rows in the raw dataset: {len(df):,}") 16 | logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}") 17 | logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}") 18 | return df 19 | 20 | 21 | def filter_top_packages(df, frac_data_to_include): 22 | logging.info( 23 | f"Using only the packages with weekly_downloads in the top {frac_data_to_include * 100}% of the dataset because config.FRAC_DATA_TO_INCLUDE is set to {frac_data_to_include}!" 24 | ) 25 | logging.info( 26 | "This means packages with low download counts are excluded from the results in the dashboard. To include more data, set config.FRAC_DATA_TO_INCLUDE to a higher value." 
27 | ) 28 | df = df.sort("weekly_downloads", descending=True) 29 | df = df.head(round(frac_data_to_include * len(df))) 30 | 31 | logging.info(f"📊 Number of rows after filtering: {len(df):,}") 32 | logging.info(f"The highest weekly downloads in the filtered dataset: {df['weekly_downloads'].max():,}") 33 | logging.info(f"The lowest weekly downloads in the filtered dataset: {df['weekly_downloads'].min():,}") 34 | return df 35 | 36 | 37 | def clean_descriptions(df): 38 | logging.info("🧹 Cleaning the descriptions...") 39 | df = DescriptionCleaner().clean(df, "description", "description_cleaned") 40 | df = df.filter(~pl.col("description_cleaned").is_null()) 41 | df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED) 42 | return df 43 | 44 | 45 | def write_csv(df, processed_dataset_path): 46 | logging.info(f"Storing dataset in {processed_dataset_path}...") 47 | df.write_csv(processed_dataset_path) 48 | logging.info("✅ Done!") 49 | 50 | 51 | def process_raw_dataset(): 52 | load_dotenv() 53 | config = Config() 54 | df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME) 55 | if config.FRAC_DATA_TO_INCLUDE < 1.0: 56 | df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE) 57 | df = clean_descriptions(df) 58 | 59 | write_csv(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) 60 | write_csv(df.select(["name", "summary", "weekly_downloads"]), config.DATA_DIR / config.DATASET_FOR_API_CSV_NAME) 61 | 62 | 63 | if __name__ == "__main__": 64 | setup_logging() 65 | process_raw_dataset() 66 | -------------------------------------------------------------------------------- /pypi_scout/scripts/setup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pypi_scout.scripts.create_vector_embeddings import create_vector_embeddings 4 | from pypi_scout.scripts.download_raw_dataset import download_raw_dataset 5 | from pypi_scout.scripts.process_raw_dataset import process_raw_dataset 6 | from pypi_scout.scripts.upload_processed_datasets import upload_processed_datasets 7 | from pypi_scout.utils.logging import setup_logging 8 | 9 | 10 | def main(): 11 | setup_logging() 12 | 13 | logging.info("\n\nDOWNLOADING RAW DATASET -------------\n") 14 | download_raw_dataset() 15 | 16 | logging.info("\n\nPROCESSING RAW DATASET -------------\n") 17 | process_raw_dataset() 18 | 19 | logging.info("\n\nCREATING VECTOR EMBEDDINGS -------------\n") 20 | create_vector_embeddings() 21 | 22 | logging.info("\n\nUPLOADING PROCESSED DATASETS -------------\n") 23 | upload_processed_datasets() 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /pypi_scout/scripts/upload_processed_datasets.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from dotenv import load_dotenv 4 | 5 | from pypi_scout.config import Config, StorageBackend 6 | from pypi_scout.utils.blob_io import BlobIO 7 | from pypi_scout.utils.logging import setup_logging 8 | 9 | 10 | def upload_processed_datasets(): 11 | load_dotenv() 12 | config = Config() 13 | 14 | if config.STORAGE_BACKEND != StorageBackend.BLOB: 15 | logging.info( 16 | "Not using BLOB backend. Skipping upload. 
To enable, configure the `STORAGE_BACKEND_` variables in config" 17 | ) 18 | return 19 | 20 | file_names = [config.PROCESSED_DATASET_CSV_NAME, config.DATASET_FOR_API_CSV_NAME, config.EMBEDDINGS_PARQUET_NAME] 21 | 22 | blob_io = BlobIO( 23 | config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, 24 | config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, 25 | config.STORAGE_BACKEND_BLOB_KEY, 26 | ) 27 | 28 | for file_name in file_names: 29 | logging.info(f"💫 Uploading {file_name} to blob container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`...") 30 | blob_io.upload_local_file(config.DATA_DIR / file_name, file_name) 31 | 32 | logging.info("✅ Done!") 33 | 34 | 35 | if __name__ == "__main__": 36 | setup_logging() 37 | upload_processed_datasets() 38 | -------------------------------------------------------------------------------- /pypi_scout/utils/blob_io.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from enum import Enum 3 | 4 | import polars as pl 5 | from azure.storage.blob import BlobServiceClient 6 | 7 | 8 | class Format(Enum): 9 | CSV = "csv" 10 | PARQUET = "parquet" 11 | 12 | 13 | class BlobIO: 14 | def __init__(self, account_name: str, container_name: str, account_key: str): 15 | self.account_name = account_name 16 | self.container_name = container_name 17 | self.account_key = account_key 18 | self.service_client = BlobServiceClient( 19 | account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key 20 | ) 21 | self.container_client = self.service_client.get_container_client(container_name) 22 | 23 | def upload_local_file(self, local_file_path: str, blob_name: str) -> None: 24 | with open(local_file_path, "rb") as data: 25 | blob_client = self.container_client.get_blob_client(blob_name) 26 | blob_client.upload_blob(data, overwrite=True) 27 | 28 | def download_csv_to_df(self, blob_name: str): 29 | return self._download_as_df(blob_name, Format.CSV) 30 | 31 | def download_parquet_to_df(self, blob_name: str): 32 | return self._download_as_df(blob_name, Format.PARQUET) 33 | 34 | def _download_as_df(self, blob_name: str, format: Format) -> pl.DataFrame: # noqa: A002 35 | """ 36 | //TODO: Improve by not reading into a file first. 
37 | """ 38 | blob_client = self.container_client.get_blob_client(blob_name) 39 | download_stream = blob_client.download_blob() 40 | 41 | with tempfile.NamedTemporaryFile(delete=True) as temp_file: 42 | temp_file.write(download_stream.readall()) 43 | temp_file.flush() 44 | 45 | if format == Format.CSV: 46 | return pl.read_csv(temp_file.name) 47 | 48 | if format == Format.PARQUET: 49 | return pl.read_parquet(temp_file.name) 50 | 51 | def exists(self, blob_name): 52 | blob_client = self.container_client.get_blob_client(blob_name) 53 | return blob_client.exists() 54 | -------------------------------------------------------------------------------- /pypi_scout/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging() -> None: 5 | logging.getLogger("azure").setLevel(logging.WARNING) 6 | 7 | logging.basicConfig( 8 | level=logging.INFO, 9 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 10 | handlers=[logging.StreamHandler()], 11 | ) 12 | -------------------------------------------------------------------------------- /pypi_scout/utils/score_calculator.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | def calculate_score( 5 | df: pl.DataFrame, weight_similarity: float = 0.5, weight_weekly_downloads: float = 0.5 6 | ) -> pl.DataFrame: 7 | """ 8 | Calculate a combined score for packages based on similarity and weekly downloads. 9 | 10 | This function normalizes the 'similarity' and 'weekly_downloads' columns to a [0, 1] scale, 11 | and computes a combined score using the provided weights for similarity and weekly downloads. 12 | The combined score helps in recommending packages that are both popular and relevant based on similarity. 13 | 14 | Args: 15 | df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns. 16 | weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5. 17 | weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5. 18 | 19 | Returns: 20 | pl.DataFrame: DataFrame with the combined score and sorted by this score in descending order. 21 | """ 22 | df = df.with_columns( 23 | log_weekly_downloads=pl.col("weekly_downloads").log1p() # log1p is log(1 + x) 24 | ) 25 | 26 | df = df.with_columns( 27 | normalized_similarity=(pl.col("similarity") - pl.col("similarity").min()) 28 | / (pl.col("similarity").max() - pl.col("similarity").min()), 29 | normalized_log_weekly_downloads=(pl.col("log_weekly_downloads") - pl.col("log_weekly_downloads").min()) 30 | / (pl.col("log_weekly_downloads").max() - pl.col("log_weekly_downloads").min()), 31 | ) 32 | 33 | df = df.with_columns( 34 | score=weight_similarity * pl.col("normalized_similarity") 35 | + weight_weekly_downloads * pl.col("normalized_log_weekly_downloads") 36 | ) 37 | 38 | df = df.sort("score", descending=True) 39 | return df 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pypi_scout" 3 | version = "0.0.1" 4 | description = "PyPI Scout helps you find PyPI packages through natural language prompts with the help of Large Language Models (LLMs)." 
5 | authors = ["Florian Maas "] 6 | repository = "https://github.com/fpgmaas/pypi-scout" 7 | documentation = "https://fpgmaas.github.io/pypi-scout/" 8 | readme = "README.md" 9 | packages = [ 10 | {include = "pypi_scout"} 11 | ] 12 | 13 | [tool.poetry.dependencies] 14 | python = ">=3.9,<4.0" 15 | beautifulsoup4 = "^4.12.3" 16 | polars = "^0.20.31" 17 | sentence-transformers = "^3.0.1" 18 | lxml = "^5.2.2" 19 | python-dotenv = "^1.0.1" 20 | tqdm = "^4.66.4" 21 | fastapi = "^0.111.0" 22 | pydantic = "^2.7.4" 23 | uvicorn = "^0.30.1" 24 | gdown = "^5.2.0" 25 | azure-storage-blob = "^12.20.0" 26 | slowapi = "^0.1.9" 27 | starlette = "^0.37.2" 28 | numpy = "^2.0.0" 29 | scikit-learn = "^1.5.0" 30 | 31 | [tool.poetry.group.dev.dependencies] 32 | pytest = "^7.2.0" 33 | pytest-cov = "^4.0.0" 34 | pytest-mock = "^3.14.0" 35 | deptry = "^0.12.0" 36 | pre-commit = "^3.4.0" 37 | tox = "^4.11.1" 38 | 39 | [build-system] 40 | requires = ["poetry-core>=1.0.0"] 41 | build-backend = "poetry.core.masonry.api" 42 | 43 | [tool.pytest.ini_options] 44 | testpaths = ["tests"] 45 | 46 | [tool.ruff] 47 | target-version = "py37" 48 | line-length = 120 49 | fix = true 50 | select = [ 51 | # flake8-2020 52 | "YTT", 53 | # flake8-bandit 54 | "S", 55 | # flake8-bugbear 56 | "B", 57 | # flake8-builtins 58 | "A", 59 | # flake8-comprehensions 60 | "C4", 61 | # flake8-debugger 62 | "T10", 63 | # flake8-simplify 64 | "SIM", 65 | # isort 66 | "I", 67 | # mccabe 68 | "C90", 69 | # pycodestyle 70 | "E", "W", 71 | # pyflakes 72 | "F", 73 | # pygrep-hooks 74 | "PGH", 75 | # pyupgrade 76 | "UP", 77 | # ruff 78 | "RUF", 79 | # tryceratops 80 | "TRY", 81 | ] 82 | ignore = [ 83 | # LineTooLong 84 | "E501", 85 | # DoNotAssignLambda 86 | "E731", 87 | ] 88 | 89 | [tool.ruff.format] 90 | preview = true 91 | 92 | [tool.coverage.report] 93 | skip_empty = true 94 | 95 | [tool.coverage.run] 96 | branch = true 97 | source = ["pypi_scout"] 98 | 99 | 100 | [tool.ruff.per-file-ignores] 101 | "tests/*" = ["S101"] 102 | 103 | [tool.deptry] 104 | extend_exclude = [ 105 | "frontend" 106 | ] 107 | 108 | [tool.deptry.per_rule_ignores] 109 | DEP002 = ["lxml", "uvicorn"] 110 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # This file is used in DockerfileCPU. It installs torch without GPU support and without the NVIDIA package. 2 | # This disables GPU support in the container, but reduces the size drastically (multiple GB's.) 
3 | beautifulsoup4==4.12.3 4 | polars==0.20.31 5 | sentence-transformers==3.0.1 6 | lxml==5.2.2 7 | python-dotenv==1.0.1 8 | tqdm==4.66.4 9 | fastapi==0.111.0 10 | pydantic==2.7.4 11 | uvicorn==0.30.1 12 | gdown==5.2.0 13 | torch==2.0.1 14 | numpy==1.24.4 15 | azure-storage-blob==12.20.0 16 | slowapi==0.1.9 17 | starlette==0.37.2 18 | scikit-learn==1.5.0 19 | --index-url=https://download.pytorch.org/whl/cpu 20 | --extra-index-url=https://pypi.org/simple 21 | -------------------------------------------------------------------------------- /static/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgmaas/pypi-scout/593a48a2512a14c350bae98b087cd861d94a0c6b/static/demo.gif -------------------------------------------------------------------------------- /static/pypi-light.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /static/pypi.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/embeddings/test_simple_vector_database.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import numpy as np 4 | import polars as pl 5 | import pytest 6 | 7 | from pypi_scout.embeddings.simple_vector_database import SimpleVectorDatabase 8 | 9 | 10 | @pytest.fixture 11 | def mock_model(): 12 | # Mock the SentenceTransformer model 13 | mock_model = MagicMock() 14 | # Mock the encode method to return a fixed vector 15 | mock_model.encode.return_value = np.array([0.5, 0.5, 0.5]) 16 | return mock_model 17 | 18 | 19 | @pytest.fixture 20 | def df_embeddings(): 21 | return pl.DataFrame( 22 | { 23 | "id": [1, 2, 3], 24 | "text": ["Hello world", "Hi there", "Greetings"], 25 | "embeddings": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]], 26 | } 27 | ) 28 | 29 | 30 | @pytest.fixture 31 | def vector_db(mock_model, df_embeddings): 32 | return SimpleVectorDatabase(embeddings_model=mock_model, df_embeddings=df_embeddings) 33 | 34 | 35 | def test_embeddings_matrix_creation(vector_db): 36 | expected_matrix = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]], dtype=np.float32) 37 | 38 | np.testing.assert_allclose(vector_db.embeddings_matrix, expected_matrix, rtol=1e-6, atol=1e-8) 39 | 40 | 41 | def test_find_similar(vector_db): 42 | query = "Hello" 43 | result = vector_db.find_similar(query, top_k=2) 44 | 45 | assert result.shape[0] == 2 46 | 47 | assert result["similarity"].min() >= 0 48 | assert result["similarity"].max() <= 1 49 | 50 | expected_columns = ["id", "text", "similarity"] 51 | assert set(result.columns) == set(expected_columns) 52 | --------------------------------------------------------------------------------
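For reference, the search endpoint defined in pypi_scout/api/main.py can be exercised without the frontend. The snippet below is a minimal, hypothetical client sketch rather than part of the repository: it assumes a backend running locally and reachable at http://localhost:8000/api (the default NEXT_PUBLIC_API_URL in frontend/next.config.mjs) and uses only the Python standard library. Note that the API rejects top_k values above 100 and rate-limits clients to 6 requests per minute.

import json
import urllib.request

# Hypothetical query; adjust the text and top_k as needed (top_k must be <= 100).
payload = json.dumps({"query": "plotting library for dataframes", "top_k": 10}).encode("utf-8")

request = urllib.request.Request(
    "http://localhost:8000/api/search",  # assumed local deployment, matching next.config.mjs
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(request) as response:
    body = json.loads(response.read())

# Each match has `name`, `summary`, `similarity`, and `weekly_downloads`,
# as defined in pypi_scout/api/models.py.
for match in body["matches"]:
    print(f"{match['name']:<30} similarity={match['similarity']:.3f} downloads={match['weekly_downloads']:,}")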