├── .editorconfig ├── .env.development ├── .env.production ├── .eslintrc.json ├── .github ├── dependabot.yml └── workflows │ └── dashboard.yml ├── .gitignore ├── .prettierignore ├── .prettierrc.json ├── LICENSE ├── README.md ├── next-sitemap.config.js ├── next.config.js ├── package.json ├── postcss.config.js ├── public ├── download.sh ├── next.svg └── vercel.svg ├── scripts └── build-index.ts ├── src ├── app │ ├── datasets │ │ ├── clickhouse.tsx │ │ ├── dataset_table.tsx │ │ ├── page.tsx │ │ ├── sql │ │ │ ├── files_by_extension.sql │ │ │ ├── largest_version.sql │ │ │ ├── longest_files.sql │ │ │ ├── most_unique.sql │ │ │ ├── python_files_over_time.sql │ │ │ └── sql.ts │ │ └── syntax.tsx │ ├── download │ │ ├── example.sh │ │ ├── example_2.sh │ │ └── page.tsx │ ├── favicon.ico │ ├── globals.css │ ├── layout.tsx │ ├── layouts │ │ └── markdown.tsx │ ├── navbar.tsx │ ├── page.tsx │ ├── projects │ │ ├── page.tsx │ │ └── view │ │ │ ├── layout.tsx │ │ │ ├── page.tsx │ │ │ └── project_info.tsx │ ├── repositories │ │ ├── [name] │ │ │ └── page.tsx │ │ ├── page.tsx │ │ └── repo-stats.tsx │ ├── stats │ │ ├── chart-scroll.tsx │ │ ├── chart.tsx │ │ ├── colours.ts │ │ ├── language-stats.tsx │ │ ├── page.tsx │ │ ├── shitpost-chart.tsx │ │ ├── shitpost-model.tsx │ │ ├── sql.tsx │ │ ├── stats.tsx │ │ ├── total_stats.tsx │ │ └── utils.tsx │ ├── swr-provider.tsx │ └── table.tsx ├── data │ └── .gitkeep └── utils.ts ├── tailwind.config.js └── tsconfig.json /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_size = 2 7 | indent_style = space 8 | insert_final_newline = true 9 | max_line_length = 120 10 | trim_trailing_whitespace = true 11 | 12 | [*.tsx] 13 | indent_size = 2 14 | -------------------------------------------------------------------------------- /.env.development: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.env.production: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" # See documentation for possible values 4 | directory: "/" # Location of package manifests 5 | schedule: 6 | interval: "weekly" 7 | groups: 8 | dependencies: 9 | patterns: 10 | - "*" 11 | - package-ecosystem: "npm" 12 | directory: "/" # Location of package manifests 13 | schedule: 14 | interval: "weekly" 15 | groups: 16 | dependencies: 17 | patterns: 18 | - "*" 19 | -------------------------------------------------------------------------------- /.github/workflows/dashboard.yml: -------------------------------------------------------------------------------- 1 | # On every push this script is executed 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | schedule: 8 | - cron: "0 10 * * *" 9 | 10 | concurrency: build 11 | 12 | name: Build and deploy GH Pages 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: checkout 18 | uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-node@v4 21 | with: 22 | node-version: "lts/*" 23 | cache: "npm" 24 | 25 | - name: Setup Pages 26 | id: setup-pages 27 | uses: actions/configure-pages@v5 28 | 29 | - name: Install deps 30 | run: | 31 | npm install 32 | 33 | - name: Build site 34 | run: | 35 | npm run build -- --no-lint 36 | cp public/sitemap*.xml out/ 37 | cp public/robots.txt out/ 38 | cp -r public/data/ out/data/ 39 | env: 
40 | NEXT_PUBLIC_BASE_PATH: ${{ steps.setup-pages.outputs.base_path }} 41 | NEXT_PUBLIC_ASSET_PATH: ${{ steps.setup-pages.outputs.base_url }} 42 | 43 | - name: Upload artifact 44 | uses: actions/upload-pages-artifact@v3 45 | with: 46 | path: ${{ github.workspace }}/out/ 47 | 48 | deploy: 49 | needs: build 50 | concurrency: dashboard-publish 51 | 52 | permissions: 53 | pages: write 54 | id-token: write 55 | actions: read 56 | 57 | # Deploy to the github-pages environment 58 | environment: 59 | name: github-pages 60 | url: ${{ steps.deployment.outputs.page_url }} 61 | 62 | runs-on: ubuntu-latest 63 | steps: 64 | - name: Deploy to GitHub Pages 65 | id: deployment 66 | uses: actions/deploy-pages@v4 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | src/data/repositories_with_releases.json 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | 27 | # local env files 28 | .env*.local 29 | 30 | # vercel 31 | .vercel 32 | 33 | # typescript 34 | *.tsbuildinfo 35 | next-env.d.ts 36 | 37 | .idea/ 38 | public/data/ 39 | public/sitemap*.xml 40 | public/robots.txt 41 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | .next/ 2 | public/ 3 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | {} 2 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tom Forbes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app). 2 | 3 | ## Getting Started 4 | 5 | First, run the development server: 6 | 7 | ```bash 8 | npm run dev 9 | # or 10 | yarn dev 11 | # or 12 | pnpm dev 13 | ``` 14 | 15 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. 
16 | 17 | You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. 18 | 19 | This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font. 20 | 21 | ## Learn More 22 | 23 | To learn more about Next.js, take a look at the following resources: 24 | 25 | - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. 26 | - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. 27 | 28 | You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome! 29 | 30 | ## Deploy on Vercel 31 | 32 | The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. 33 | 34 | Check out our [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details. 
35 | -------------------------------------------------------------------------------- /next-sitemap.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('next-sitemap').IConfig} */ 2 | module.exports = { 3 | siteUrl: 'https://py-code.org', 4 | sitemapSize: 40000, 5 | generateIndexSitemap: true, 6 | generateRobotsTxt: true, 7 | additionalPaths: async (config) => { 8 | const response = await fetch("https://data.py-code.org/data/pages.json") 9 | const packageList = await response.json(); 10 | const packages = packageList.packages.toSorted((a, b) => a.toLowerCase().localeCompare(b.toLowerCase())); 11 | 12 | return packages.map((pkg) => ({ 13 | loc: `https://py-code.org/projects/view?name=${pkg.toLowerCase()}`, 14 | })); 15 | }, 16 | } 17 | -------------------------------------------------------------------------------- /next.config.js: -------------------------------------------------------------------------------- 1 | const { join } = require("path"); 2 | const { symlink, access, mkdir } = require("fs/promises"); 3 | const basePath = (process.env.NEXT_PUBLIC_BASE_PATH || "").replace("http://", "https://"); 4 | const assetPrefix = (process.env.NEXT_PUBLIC_ASSET_PATH || "").replace("http://", "https://"); 5 | 6 | const nextConfig = { 7 | pageExtensions: ["ts", "tsx", "js", "jsx", "md", "mdx"], 8 | basePath, 9 | assetPrefix, 10 | // experimental: { 11 | // mdxRs: true, 12 | // }, 13 | output: "export", 14 | webpack: (config, { isServer, dev }) => { 15 | config.experiments = Object.assign(config.experiments || {}, { 16 | asyncWebAssembly: true, 17 | layers: true, 18 | }); 19 | if (!dev && isServer) { 20 | config.output.webassemblyModuleFilename = "chunks/[id].wasm"; 21 | config.plugins.push(new WasmChunksFixPlugin()); 22 | } 23 | return config; 24 | }, 25 | }; 26 | module.exports = nextConfig; 27 | 28 | class WasmChunksFixPlugin { 29 | apply(compiler) { 30 | 
compiler.hooks.thisCompilation.tap("WasmChunksFixPlugin", (compilation) => { 31 | compilation.hooks.processAssets.tap({ name: "WasmChunksFixPlugin" }, (assets) => 32 | Object.entries(assets).forEach(([pathname, source]) => { 33 | if (!pathname.match(/\.wasm$/)) return; 34 | compilation.deleteAsset(pathname); 35 | 36 | const name = pathname.split("/")[1]; 37 | const info = compilation.assetsInfo.get(pathname); 38 | compilation.emitAsset(name, source, info); 39 | }), 40 | ); 41 | }); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pypi-data-site", 3 | "version": "0.1.0", 4 | "private": true, 5 | "engines": { 6 | "node": ">=18.13.0" 7 | }, 8 | "scripts": { 9 | "dev": "next dev", 10 | "build": "next build", 11 | "start": "next start", 12 | "lint": "next lint", 13 | "postbuild": "next-sitemap && npm run build-index", 14 | "build-index": "mkdir -p public/data/ && ts-node scripts/build-index.ts public/data/fuse-index.json", 15 | "postinstall": "wget https://raw.githubusercontent.com/pypi-data/data/main/stats/repositories_with_releases.json -O src/data/repositories_with_releases.json" 16 | }, 17 | "dependencies": { 18 | "@heroicons/react": "^2.1.4", 19 | "@next/bundle-analyzer": "~14.2.4", 20 | "@tailwindcss/typography": "^0.5.13", 21 | "@tanstack/react-table": "^8.17.3", 22 | "byte-size": "^8.1.1", 23 | "chroma-js": "^2.4.2", 24 | "daisyui": "^4.12.7", 25 | "date-fns": "^3.6.0", 26 | "eslint": "^9.5.0", 27 | "eslint-config-next": "~14.2.4", 28 | "fuse.js": "^7.0.0", 29 | "human-format": "^1.2.0", 30 | "lodash.samplesize": "^4.2.0", 31 | "next": "~14.2.4", 32 | "next-sitemap": "^4.2.3", 33 | "prql-js": "==0.12.1", 34 | "react": "18.3.1", 35 | "react-dom": "18.3.1", 36 | "react-syntax-highlighter": "^15.5.0", 37 | "react-timestamp": "^6.0.0", 38 | "recharts": "^2.12.7", 39 | "sql-formatter": "^15.3.2", 40 | 
"swr": "^2.2.5", 41 | "ts-node": "^10.9.2", 42 | "typescript": "<5.6.0", 43 | "use-debounce": "^10.0.1" 44 | }, 45 | "devDependencies": { 46 | "@types/byte-size": "^8.1.2", 47 | "@types/chroma-js": "^2.4.4", 48 | "@types/lodash.samplesize": "^4.2.9", 49 | "@types/node": "^20.14.8", 50 | "@types/react": "^18.3.3", 51 | "@types/react-dom": "^18.3.0", 52 | "@types/react-syntax-highlighter": "^15.5.13", 53 | "autoprefixer": "^10.4.19", 54 | "postcss": "^8.4.38", 55 | "prettier": "^3.3.2", 56 | "raw-loader": "^4.0.2", 57 | "tailwindcss": "^3.4.4" 58 | }, 59 | "browserslist": [ 60 | ">0.3%", 61 | "not dead", 62 | "not op_mini all" 63 | ] 64 | } 65 | -------------------------------------------------------------------------------- /postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | }; 7 | -------------------------------------------------------------------------------- /public/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ $# -eq 0 ]] ; then 4 | echo 'Usage: [path]' 5 | exit 1 6 | fi 7 | 8 | mkdir -p "$1" 9 | 10 | for url in $(curl https://raw.githubusercontent.com/pypi-data/data/main/links/repositories.txt); do 11 | git -C "$1" clone "$url" --depth=1 --no-checkout --branch=code 12 | done 13 | 14 | -------------------------------------------------------------------------------- /public/next.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/build-index.ts: -------------------------------------------------------------------------------- 1 | 
import Fuse from "fuse.js"; 2 | import * as fs from "fs"; 3 | 4 | /** 5 | * Downloads the package list and writes a serialized Fuse.js index plus the package names to the path given as the first CLI argument. 6 | */ 7 | async function fetchData() { 8 | const outputPath = process.argv[2]; 9 | if (!outputPath) { 10 | throw new Error("Usage: build-index.ts <output-path>"); 11 | } 12 | 13 | const response = await fetch("https://data.py-code.org/data/pages.json"); 14 | if (!response.ok) { 15 | throw new Error(`Failed to fetch pages.json: ${response.status} ${response.statusText}`); 16 | } 17 | const packageList = await response.json(); 18 | 19 | // @ts-ignore 20 | const packages: string[] = packageList.packages; //.slice(0, 100_000); 21 | // const packages: string[] = packageList.packages; 22 | 23 | const index = { 24 | json: Fuse.createIndex([], packages).toJSON(), 25 | packages: packages, 26 | }; 27 | // Serialize once and reuse the string for both the size log and the file write. 28 | const serialized = JSON.stringify(index); 29 | console.log(outputPath, serialized.length); 30 | // const encoded = zlib.deflateSync(JSON.stringify(index), { level: 9 }); 31 | fs.writeFileSync(outputPath, serialized); 32 | } 33 | 34 | // Report failures and exit non-zero so the build step fails visibly. 35 | fetchData().catch((error) => { 36 | console.error(error); 37 | process.exit(1); 38 | }); 39 | -------------------------------------------------------------------------------- /src/app/datasets/clickhouse.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { useState } from "react"; 4 | 5 | export default function ClickhouseView({ queries }: { queries: { name: string; query: string }[] }) { 6 | const [index, setIndex] = useState(0); 7 | const query = queries[index].query; 8 | const url = `https://play.clickhouse.com/play?user=play#${query}`; 9 | return ( 10 | <> 11 |
12 | {queries.map(({ name }, i) => ( 13 |
14 | 22 |
23 | ))} 24 |
25 | 26 |
27 | 28 | Open in new tab 29 | 30 |
31 | 32 | ); 33 | } 34 | -------------------------------------------------------------------------------- /src/app/datasets/dataset_table.tsx: -------------------------------------------------------------------------------- 1 | import byteSize from "byte-size"; 2 | 3 | export default function LinksTable({ data }: { data: { url: string; size: number }[] }) { 4 | return ( 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | {data 14 | .sort((a, b) => (a.size < b.size ? 1 : -1)) 15 | .map((e) => ( 16 | 17 | 20 | 21 | 22 | ))} 23 | 24 | 25 | 26 | 27 | 33 | 34 | 35 |
URLSize
18 | {e.url} 19 | {byteSize(e.size, { units: "iec", precision: 1 }).toString()}
{data.length.toLocaleString()} links 28 | {byteSize( 29 | data.reduce((acc, cur) => acc + cur.size, 0), 30 | { units: "iec", precision: 1 }, 31 | ).toString()} 32 |
36 | ); 37 | } 38 | -------------------------------------------------------------------------------- /src/app/datasets/page.tsx: -------------------------------------------------------------------------------- 1 | import LinksTable from "@/app/datasets/dataset_table"; 2 | import SyntaxHighlight from "@/app/datasets/syntax"; 3 | import getStats from "@/app/stats/stats"; 4 | import { ChevronDownIcon } from "@heroicons/react/24/solid"; 5 | import Table from "@/app/table"; 6 | import ClickhouseView from "@/app/datasets/clickhouse"; 7 | import queries from "@/app/datasets/sql/sql"; 8 | 9 | const SQLITE_URL = "https://github.com/pypi-data/pypi-json-data/releases/download/latest/pypi-data.sqlite.gz"; 10 | const DATASET_URL = "https://github.com/pypi-data/data/raw/main/links/dataset.txt"; 11 | const PYTHON_DATASET_URL = "https://github.com/pypi-data/data/raw/main/links/only_python_files.txt"; 12 | const REPOSITORIES_DATASET_URL = "https://github.com/pypi-data/data/raw/main/stats/repositories.json"; 13 | const REPOSITORIES_WITH_RELEASES_DATASET_URL = 14 | "https://github.com/pypi-data/data/raw/main/stats/repositories_with_releases.json"; 15 | const CURL_EXAMPLE = `$ curl -L --remote-name-all $(curl -L "${DATASET_URL}")`; 16 | const PYTHON_CURL_EXAMPLE = `$ curl -L --remote-name-all $(curl -L "${PYTHON_DATASET_URL}")`; 17 | const SQLITE_CURL_EXAMPLE = `$ curl -L ${SQLITE_URL} | gzip -d > pypi-data.sqlite`; 18 | const DUCK_DB_EXAMPLE = `${CURL_EXAMPLE} 19 | $ duckdb -json -s "select * from '*.parquet' order by lines DESC limit 1" 20 | [ 21 | { 22 | "project_name": "EvenOrOdd", 23 | "project_version": "0.1.10", 24 | "project_release": "EvenOrOdd-0.1.10-py3-none-any.whl", 25 | "uploaded_on": "2021-02-21 02:25:57.832", 26 | "path": "EvenOrOdd/EvenOrOdd.py", 27 | "size": "514133366", 28 | "hash": "ff7f863ad0bb4413c939fb5e9aa178a5a8855774262e1171b876d1d2b51e6998", 29 | "skip_reason": "too-large", 30 | "lines": "20010001" 31 | } 32 | ] 33 | `; 34 | 35 | const EVEN_OR_ODD_EXAMPLE = 
`$ wget https://files.pythonhosted.org/packages/b2/82/c4265814ed9e68880ba0892eddf1664c48bb490f37113d74d32fe4757192/EvenOrOdd-0.1.10-py3-none-any.whl 36 | $ unzip EvenOrOdd-0.1.10-py3-none-any.whl 37 | $ wc -l EvenOrOdd/EvenOrOdd.py 38 | 20010000 EvenOrOdd/EvenOrOdd.py 39 | 40 | $ tail -n6 EvenOrOdd/EvenOrOdd.py 41 | elif num == 9999996: 42 | return True 43 | elif num == 9999997: 44 | return False 45 | elif num == 9999998: 46 | return True 47 | elif num == 9999999: 48 | return False 49 | else: 50 | raise Exception("Number is not within bounds") 51 | `; 52 | 53 | export default async function Page() { 54 | const stats = await getStats(); 55 | 56 | const sampleresp = await fetch("https://raw.githubusercontent.com/pypi-data/data/main/stats/random_sample.json"); 57 | const sampledata = await sampleresp.json(); 58 | 59 | const linkresp = await fetch(DATASET_URL); 60 | const links = (await linkresp.text()).split("\n").filter((e) => e.length > 0); 61 | 62 | const sqliteResponse = await fetch(SQLITE_URL, { method: "HEAD" }); 63 | const sqliteSize = Number(sqliteResponse.headers.get("content-length")); 64 | 65 | const sqliteSchemaResponse = await fetch( 66 | "https://raw.githubusercontent.com/pypi-data/pypi-json-data/main/scripts/schema.sql", 67 | ); 68 | const sqliteSchema = await sqliteSchemaResponse.text(); 69 | 70 | const sqliteSizes = [{ url: SQLITE_URL, size: sqliteSize }]; 71 | 72 | const sizes = await Promise.all( 73 | links.map(async (link) => { 74 | let resp = await fetch(link, { 75 | method: "HEAD", 76 | headers: { 77 | "accept-encoding": "", 78 | }, 79 | }); 80 | return { 81 | url: link, 82 | size: Number(resp.headers.get("content-length")), 83 | }; 84 | }), 85 | ); 86 | 87 | const pythonlinkresp = await fetch(PYTHON_DATASET_URL); 88 | const pythonlinks = (await pythonlinkresp.text()).split("\n").filter((e) => e.length > 0); 89 | 90 | const pythonsampleresp = await fetch( 91 | 
"https://raw.githubusercontent.com/pypi-data/data/main/stats/random_sample_python_only.json", 92 | ); 93 | const pythonsampledata = await pythonsampleresp.json(); 94 | 95 | const pythonsizes = await Promise.all( 96 | pythonlinks.map(async (link) => { 97 | let resp = await fetch(link, { 98 | method: "HEAD", 99 | headers: { 100 | "accept-encoding": "", 101 | }, 102 | }); 103 | return { 104 | url: link, 105 | size: Number(resp.headers.get("content-length")), 106 | }; 107 | }), 108 | ); 109 | 110 | const repo_metadata_example = await (await fetch(REPOSITORIES_DATASET_URL)).json(); 111 | const repo_metadata_example_element = repo_metadata_example[0]; 112 | 113 | const repo_metadata_sizes = await Promise.all( 114 | [REPOSITORIES_WITH_RELEASES_DATASET_URL, REPOSITORIES_DATASET_URL].map(async (link) => { 115 | let resp = await fetch(link, { 116 | method: "HEAD", 117 | headers: { 118 | "accept-encoding": "", 119 | }, 120 | }); 121 | return { 122 | url: link, 123 | size: Number(resp.headers.get("content-length")), 124 | }; 125 | }), 126 | ); 127 | 128 | return ( 129 | <> 130 |

Datasets

131 |
132 |

Explore the data in your browser

133 | 134 | 135 |
136 | 137 |

Download datasets locally

138 |
139 |

There are several datasets available for use:

140 |
    141 |
  1. 142 | Metadata about every file uploaded to PyPI 143 |
  2. 144 |
  3. 145 | SQLite dump of all PyPI metadata 146 |
  4. 147 |
  5. 148 | Repository metadata 149 |
  6. 150 |
  7. 151 | Unique Python files within every release 152 |
  8. 153 |
154 |

155 | These datasets allow you to analyse the contents of PyPI without having to download and process every package 156 | yourself. All of the statistics within the stats page are periodically generated using 157 | the datasets below. 158 |

159 |
160 | 161 |
162 |
163 |

164 | Metadata about every file uploaded to PyPI 165 |

166 |

About

167 |
168 |

This dataset contains information about every file within every release uploaded to PyPI, including:

169 |
    170 |
  1. Project name, version and release upload date
  2. 171 |
  3. File path, size and line count
  4. 172 |
  5. SHA256 hash
  6. 173 |
174 |

175 | The dataset should be accessed by downloading the files specified within{" "} 176 | 177 | {DATASET_URL} 178 | {" "} 179 | . The following command downloads the dataset from this URL: 180 |

181 | {CURL_EXAMPLE} 182 | 183 |
184 | 185 |

186 |
187 | 188 |
189 | Using DuckDB to process the dataset 190 |

191 |
192 |
193 |

194 | DuckDB is a great tool for processing the dataset. It is very fast 195 | and supports SQL queries over Parquet files. The following command uses DuckDB to find the largest 196 | file ever uploaded to PyPI: 197 |

198 | {DUCK_DB_EXAMPLE} 199 |

Woah, a whopping 20 million lines of code! Let's confirm it:

200 | {EVEN_OR_ODD_EXAMPLE} 201 |

Very funny, I hope this module is a joke 😅

202 |
203 |
204 | 205 |
206 | 207 |

208 |
209 | 210 |
211 | About skipped files 212 |

213 |
214 |
215 |
216 |

217 | The dataset contains a skip_reason column. If a file is not present in the git 218 | repositories then the reason for skipping is recorded here. On the right is a list of the current 219 | skip reasons and the number of files excluded from the git repositories for each reason. 220 |

221 |

222 | The exact reasons for skipping a file are not fully documented here, but ignored files 223 | include virtual environments accidentally uploaded to PyPI. text-long-lines means the 224 | file had very few lines, but the total size was large. 225 |

226 |
227 |
228 | Skipped reasons: 229 | skip_reason != "")} 231 | columns={[ 232 | { name: "skip_reason" }, 233 | { name: "count", type: "number" }, 234 | { 235 | name: "total_size", 236 | type: "bytes", 237 | }, 238 | ]} 239 | /> 240 | 241 | 242 | 243 | 244 | 245 |
246 |
247 |

Current Links

248 | 249 |
250 |
251 |

Schema

252 | {JSON.stringify(sampledata, null, 2)} 253 |
254 |
255 | 256 | 257 | 258 |
259 | 260 |
261 |
262 |

263 | SQLite dump of all PyPI metadata 264 |

265 |

About

266 |

267 | This is a SQLite dump of all PyPI metadata fetched from the PyPI API. It is updated daily. It can be 268 | accessed directly from the following url:  269 | 270 | {SQLITE_URL} 271 | 272 | : 273 |

274 | {SQLITE_CURL_EXAMPLE} 275 |
276 |
277 |

Links

278 | 279 |
280 |
281 |

Schema

282 | {sqliteSchema} 283 |
284 |
285 |
286 |
287 | 288 |
289 | 290 |
291 |
292 |

293 | Repository Metadata 294 |

295 |

296 | This dataset contains information about the pypi-data git repositories. The{" "} 297 | repositories_with_releases.json 298 | file contains a list of project names contained within each git repository. 299 |

300 | 301 |
302 |
303 |

About

304 | 305 |

Current Links

306 | 307 |
308 |
309 |

Schema

310 | 311 | {JSON.stringify([repo_metadata_example_element], null, 2)} 312 | 313 |
314 |
315 |
316 |
317 | 318 |
319 | 320 |
321 |
322 |

323 | Unique Python files 324 |

325 |

326 | This dataset contains one row per unique Python file within every release uploaded to PyPI. 327 | Only the sha256 hash and a random path to the file are provided. This dataset is useful if you want to parse 328 | the Python files yourself, but want to avoid parsing the same file multiple times. 329 |

330 |

331 | Like the main dataset, the unique files dataset should be accessed by downloading the links 332 | 333 | from the following file 334 | {" "} 335 | : 336 |

337 | {PYTHON_CURL_EXAMPLE} 338 |
339 |
340 |

About

341 | 342 |

Current Links

343 | 344 |
345 |
346 |

Schema

347 | {JSON.stringify(pythonsampledata, null, 2)} 348 |
349 |
350 |
351 |
352 | 353 | ); 354 | } 355 | -------------------------------------------------------------------------------- /src/app/datasets/sql/files_by_extension.sql: -------------------------------------------------------------------------------- 1 | select arrayElement(splitByChar('.', arrayElement(splitByChar('/', path), -1)), -1) as extension, 2 | count(*) as total_files, 3 | formatReadableSize(sum(size)) as total_size 4 | from pypi 5 | where skip_reason = '' 6 | group by 1 7 | order by sum(size) desc 8 | limit 10; 9 | -------------------------------------------------------------------------------- /src/app/datasets/sql/largest_version.sql: -------------------------------------------------------------------------------- 1 | SELECT project_name, 2 | project_version, 3 | formatReadableSize(sum(size)) as total_size, 4 | count(*) as files 5 | FROM pypi 6 | group by 1, 2 7 | order by sum(size) desc 8 | -------------------------------------------------------------------------------- /src/app/datasets/sql/longest_files.sql: -------------------------------------------------------------------------------- 1 | SELECT project_release, 2 | path, 3 | lines 4 | FROM pypi 5 | order by lines desc 6 | limit 10; 7 | -------------------------------------------------------------------------------- /src/app/datasets/sql/most_unique.sql: -------------------------------------------------------------------------------- 1 | with project_files as (SELECT project_name, 2 | count(*) as files, 3 | count(distinct hash) as unique, 4 | round((unique / files) * 100) as percent_unique, 5 | formatReadableSize(sum(size)) as total_size 6 | FROM pypi 7 | where endsWith(path, '.py') 8 | group by 1) 9 | select * 10 | from project_files 11 | where files > (select quantile(0.995)(files) from project_files) 12 | order by 4 desc 13 | limit 10 14 | -------------------------------------------------------------------------------- /src/app/datasets/sql/python_files_over_time.sql: 
-------------------------------------------------------------------------------- 1 | SELECT toYear(uploaded_on) as year, 2 | count(*) as python_files, 3 | count(distinct hash) as unique_files, 4 | sum(lines) as total_lines, 5 | round((unique_files/python_files) * 100) as unique_percent 6 | 7 | FROM pypi 8 | where endsWith(path, '.py') 9 | group by 1 10 | order by 1 desc 11 | limit 25; 12 | -------------------------------------------------------------------------------- /src/app/datasets/sql/sql.ts: -------------------------------------------------------------------------------- 1 | import { format } from "sql-formatter"; 2 | 3 | // @ts-ignore 4 | import mostUnique from "raw-loader!./most_unique.sql"; 5 | // @ts-ignore 6 | import overTime from "raw-loader!./python_files_over_time.sql"; 7 | // @ts-ignore 8 | import filesByExt from "raw-loader!./files_by_extension.sql"; 9 | // @ts-ignore 10 | import longestFiles from "raw-loader!./longest_files.sql"; 11 | // @ts-ignore 12 | import largestVersion from "raw-loader!./largest_version.sql"; 13 | 14 | const queries = [ 15 | createExample("Largest versions", largestVersion), 16 | createExample("Longest files", longestFiles), 17 | createExample("Largest projects by unique files", mostUnique), 18 | createExample("Unique files over time", overTime), 19 | createExample("Sizes by extension", filesByExt), 20 | ]; 21 | 22 | export default queries; 23 | 24 | // Pretty-prints a raw SQL string and returns it base64-encoded alongside its display name. 25 | function createExample(name: string, sql: string) { 26 | const prettySql = format(sql, { language: "sql", indentStyle: "tabularLeft" }); 27 | return { name, query: Buffer.from(prettySql).toString("base64") }; 28 | } 29 | -------------------------------------------------------------------------------- /src/app/datasets/syntax.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { PrismLight as SyntaxHighlighter } from "react-syntax-highlighter"; 4 | import shell 
from "react-syntax-highlighter/dist/esm/languages/prism/shell-session"; 5 | import json from "react-syntax-highlighter/dist/esm/languages/prism/json"; 6 | import sql from "react-syntax-highlighter/dist/esm/languages/prism/sql"; 7 | import { tomorrow } from "react-syntax-highlighter/dist/esm/styles/prism"; 8 | 9 | SyntaxHighlighter.registerLanguage("shell", shell); 10 | SyntaxHighlighter.registerLanguage("json", json); 11 | SyntaxHighlighter.registerLanguage("sql", sql); 12 | 13 | export default function SyntaxHighlight({ language, children }: { language: string; children: string | string[] }) { 14 | return ( 15 | 26 | {children} 27 | 28 | ); 29 | } 30 | -------------------------------------------------------------------------------- /src/app/download/example.sh: -------------------------------------------------------------------------------- 1 | wget https://py-code.org/download.sh 2 | chmod +x download.sh 3 | ./download.sh pypi_code 4 | -------------------------------------------------------------------------------- /src/app/download/example_2.sh: -------------------------------------------------------------------------------- 1 | git rev-list --no-object-names --all --objects --filter=object:type=blob --all -- 'packages/4suite-xml/' | git cat-file --batch 2 | -------------------------------------------------------------------------------- /src/app/download/page.tsx: -------------------------------------------------------------------------------- 1 | import byteSize from "byte-size"; 2 | import SyntaxHighlight from "@/app/datasets/syntax"; 3 | // @ts-ignore 4 | import contents from "raw-loader!@public/download.sh"; 5 | // @ts-ignore 6 | import example from "raw-loader!./example.sh"; 7 | 8 | // @ts-ignore 9 | import example_2 from "raw-loader!./example_2.sh"; 10 | import { getData as getRepoData } from "@/utils"; 11 | 12 | export default async function Download() { 13 | const repoData = await getRepoData(); 14 | const total_size = repoData.reduce((acc, repo) => acc 
+ repo.size, 0); 15 | 16 | return ( 17 | <> 18 |

Download PyPI

19 |
20 |

Step 1: Ensure you have space

21 |

22 | The current size of all the repositories is{" "} 23 | {byteSize(total_size, { 24 | precision: 1, 25 | }).toString()} 26 | . Make sure you have enough space on your machine before continuing. 27 |

28 |

Step 2: Clone the repositories

29 |
30 |
31 |

Clone the repositories using the following command:

32 | {example} 33 |

34 | This will create a new directory called pypi_code and begin fetching all the data from 35 | GitHub. This will take several hours. 36 |

37 |
38 |
39 |

40 | download.sh contents: 41 |

42 | {contents} 43 |
44 |
45 |

Step 3: Use the data!

46 |

47 | The data is available by standard git tooling. To list all the files within the{" "} 48 | 4suite-xml package you could run: 49 |

50 | {example_2} 51 |

And listing all files can be done with:

52 | git rev-list --objects --all 53 |

54 | There is also a dataset of all the unique Python files available for download.{" "} 55 | See here for more information. 56 |

57 |
58 | 59 | ); 60 | } 61 | -------------------------------------------------------------------------------- /src/app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pypi-data/website/HEAD/src/app/favicon.ico -------------------------------------------------------------------------------- /src/app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer base { 6 | h1 { 7 | @apply text-4xl; 8 | @apply font-extrabold; 9 | @apply mb-3; 10 | } 11 | 12 | h1.card-title { 13 | @apply text-2xl; 14 | } 15 | 16 | .card-body h3 { 17 | @apply mt-4; 18 | } 19 | 20 | h2 { 21 | @apply text-2xl; 22 | @apply font-bold; 23 | @apply mb-3; 24 | } 25 | 26 | h3 { 27 | @apply text-xl; 28 | @apply mb-3; 29 | @apply font-bold; 30 | } 31 | 32 | p { 33 | @apply mb-1; 34 | } 35 | 36 | article { 37 | @apply mb-3; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/app/layout.tsx: -------------------------------------------------------------------------------- 1 | import "./globals.css"; 2 | import type { Metadata } from "next"; 3 | import { Inter } from "next/font/google"; 4 | import { SWRProvider } from "./swr-provider"; 5 | import NavBar from "./navbar"; 6 | import React, { Suspense } from "react"; 7 | 8 | const inter = Inter({ subsets: ["latin"] }); 9 | 10 | export const metadata: Metadata = { 11 | title: "PyPI Data", 12 | description: "PyPI code explorable on Github", 13 | }; 14 | 15 | export default function RootLayout({ children }: { children: React.ReactNode }) { 16 | return ( 17 | 18 | 19 | 20 | 21 |
22 | }>{children} 23 |
24 |
25 | 34 | 35 |
36 | 37 | ); 38 | } 39 | -------------------------------------------------------------------------------- /src/app/layouts/markdown.tsx: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | import React from "react"; 3 | 4 | export default function Markdown({ children }: { children: React.ReactNode }) { 5 | return
{children}
; 6 | } 7 | -------------------------------------------------------------------------------- /src/app/navbar.tsx: -------------------------------------------------------------------------------- 1 | export default function NavBar() { 2 | return ( 3 | 26 | ); 27 | } 28 | -------------------------------------------------------------------------------- /src/app/page.tsx: -------------------------------------------------------------------------------- 1 | import TotalStats from "@/app/stats/total_stats"; 2 | import getStats from "@/app/stats/stats"; 3 | import byteSize from "byte-size"; 4 | import { getData as getRepoData } from "@/utils"; 5 | 6 | export default async function Home() { 7 | const data = await getStats(); 8 | const repoData = await getRepoData(); 9 | const total_size = repoData.reduce((acc, repo) => acc + repo.size, 0); 10 | const chartData = data.stats_over_time.sort((a, b) => (a.month < b.month ? -1 : 1)); 11 | const lastMonth = chartData[chartData.length - 2]; 12 | 13 | return ( 14 | <> 15 |
16 |
17 |
18 |

What is this?

19 |
20 |

21 | This project makes it easy to analyze the Python ecosystem by providing all of the code ever published to 22 | PyPI via git, parquet datasets with file metadata, and a set of tools to help analyze the data. 23 |

24 |

25 | Thanks to the power of git the contents of PyPI take up only{" "} 26 | {byteSize(total_size, { 27 | precision: 1, 28 | }).toString()}{" "} 29 | on disk, and thanks to tools like libcst every Python 30 | file can be analysed on a consumer-grade laptop in a few hours. 31 |

32 |

33 | 34 | Download all the code 35 | 36 | 37 | Explore the datasets 38 | 39 |

40 |
41 |
42 |
43 | 56 | 57 | ); 58 | } 59 | -------------------------------------------------------------------------------- /src/app/projects/page.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import { useRouter, useSearchParams } from "next/navigation"; 3 | import { useEffect, useMemo, useState } from "react"; 4 | import {default as Fuse, FuseIndex} from "fuse.js"; 5 | import { useDebounce } from "use-debounce"; 6 | import sampleSize from "lodash.samplesize"; 7 | import useSWRImmutable from "swr/immutable"; 8 | 9 | // const ASSET_PATH = "https://data.py-code.org" //(process.env.NEXT_PUBLIC_ASSET_PATH || "").replace("http://", "https://"); 10 | 11 | export default function ProjectsList() { 12 | const router = useRouter(); 13 | const searchParams = useSearchParams()!; 14 | const searchParam = searchParams.get("search") || ""; 15 | let [search, setSearch] = useState(searchParam); 16 | const [debouncedSearch] = useDebounce(search, 500); 17 | const [isClient, setIsClient] = useState(false); 18 | 19 | const { data, error, isLoading } = useSWRImmutable(`/data/fuse-index.json`); 20 | 21 | const fuse = useMemo(() => { 22 | if (error || isLoading || data == null) { 23 | return null; 24 | } 25 | // @ts-ignore 26 | const idx: FuseIndex = Fuse.parseIndex(data.json); 27 | // @ts-ignore 28 | return new Fuse( 29 | data.packages, 30 | { 31 | includeScore: false, 32 | threshold: 0.3, 33 | distance: 10, 34 | // ignoreLocation: true, 35 | useExtendedSearch: false, 36 | }, 37 | idx, 38 | ); 39 | }, [data, error, isLoading]); 40 | 41 | useEffect(() => { 42 | setIsClient(true); 43 | }, []); 44 | 45 | useEffect(() => { 46 | if (!debouncedSearch) { 47 | return; 48 | } 49 | // @ts-ignore 50 | const params = new URLSearchParams(searchParams); 51 | params.set("search", debouncedSearch); 52 | router.replace(`/projects/?${params}`); 53 | }, [debouncedSearch, router, searchParams]); 54 | 55 | const searchResults = useMemo(() => { 56 | if 
(debouncedSearch.length > 3 && fuse) { 57 | console.time(`search ${debouncedSearch}`); 58 | let result = fuse.search(debouncedSearch, { limit: 50 }); 59 | console.timeEnd(`search ${debouncedSearch}`); 60 | // @ts-ignore 61 | return result.map(({ item }) => item); 62 | } else if (debouncedSearch.length == 0 && isClient) { 63 | // Select 10 random packages 64 | // @ts-ignore 65 | return sampleSize((data && data.packages) || [], 10); 66 | } 67 | return []; 68 | }, [fuse, data, debouncedSearch, isClient]); 69 | 70 | const randomName = useMemo(() => { 71 | if (isClient && data) { 72 | // @ts-ignore 73 | return sampleSize(data.packages, 1)[0]; 74 | } else { 75 | return null; 76 | } 77 | }, [data, isClient]); 78 | 79 | if (isLoading) { 80 | return ; 81 | } else if (error) { 82 | return ( 83 |
84 | 90 | 96 | 97 | Error loading search index! {error.toString()} 98 |
99 | ); 100 | } 101 | 102 | return ( 103 | <> 104 |
105 |

Projects List

106 |

107 | This is a list of all the projects that have been uploaded to PyPI. You can fuzzy-search for a project by 108 | name. All searching is done client-side. 109 |

110 |
111 |
{ 114 | e.preventDefault(); 115 | }} 116 | > 117 |
118 | { 125 | setSearch(e.target.value); 126 | }} 127 | value={search} 128 | /> 129 | 140 |
141 | 142 |
143 |
144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | { 152 | // @ts-ignore 153 | searchResults.map((p) => { 154 | return ( 155 | 156 | 161 | 166 | 167 | ); 168 | })} 169 | 170 |
NameLink
157 |

158 | {p} 159 |

160 |
162 | 163 | View 164 | 165 |
171 |
172 | 173 | ); 174 | } 175 | -------------------------------------------------------------------------------- /src/app/projects/view/layout.tsx: -------------------------------------------------------------------------------- 1 | import React, { Suspense } from "react"; 2 | 3 | function ProjectFallback() { 4 | return <>Loading; 5 | } 6 | 7 | export default function RootLayout({ children }: { children: React.ReactNode }) { 8 | return }>{children}; 9 | } 10 | -------------------------------------------------------------------------------- /src/app/projects/view/page.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import { useSearchParams } from "next/navigation"; 3 | import ProjectInfo from "./project_info"; 4 | 5 | export default function Page() { 6 | const searchParams = useSearchParams(); 7 | const name = searchParams.get("name"); 8 | if (name == null) { 9 | return ( 10 |
11 | 17 | 23 | 24 | Error! No project given 25 |
26 | ); 27 | } 28 | return ; 29 | } 30 | -------------------------------------------------------------------------------- /src/app/projects/view/project_info.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import useSWRImmutable from "swr/immutable"; 3 | import Timestamp from "react-timestamp"; 4 | import { useEffect } from "react"; 5 | 6 | type PackageWithIndex = { 7 | index: number; 8 | package_filename: string; 9 | package: { 10 | project_name: string; 11 | project_version: string; 12 | url: string; 13 | upload_time: string; 14 | processed: boolean; 15 | }; 16 | }; 17 | 18 | type ProjectInfo = { 19 | name: string; 20 | packages_with_indexes: PackageWithIndex[]; 21 | }; 22 | 23 | function getInspectorLink(p: PackageWithIndex): string { 24 | const url = new URL(p.package.url); 25 | return `https://inspector.pypi.io/project/${p.package.project_name}/${p.package.project_version}${url.pathname}`; 26 | } 27 | 28 | //const ASSET_PATH = (process.env.NEXT_PUBLIC_ASSET_PATH || "").replace("http://", "https://"); 29 | const ASSET_PATH = "https://data.py-code.org"; 30 | 31 | export default function ProjectInfo({ name }: { name: string }) { 32 | const first_char = Array.from(name)[0]; 33 | const { data, error, isLoading } = useSWRImmutable(`${ASSET_PATH}/data/packages/${first_char}/${name}.json`); 34 | useEffect(() => { 35 | if (!isLoading) { 36 | document.title = `PyPI code for ${name}`; 37 | const canonical = document.createElement("link"); 38 | canonical.rel = "canonical"; 39 | canonical.href = `https://py-code.org/projects/view?name=${name}`; 40 | document.head.appendChild(canonical); 41 | } 42 | }, [isLoading, name]); 43 | if (isLoading) { 44 | return

Loading

; 45 | } 46 | const project_info: ProjectInfo = data; 47 | if (data === undefined) { 48 | return
49 |

Project not found: {name}

50 |

The project with the name {name} cannot be found.

51 |
; 52 | } 53 | return ( 54 | <> 55 |
56 |

Source code for {project_info.name}

57 |

58 | The PyPI project {project_info.name} has {project_info.packages_with_indexes.length} packages. 59 | Click the links below to view the source code for these packages on GitHub. 60 |

61 |
62 |
63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | {project_info.packages_with_indexes.map((p, idx) => { 75 | return ( 76 | 77 | 78 | 79 | 88 | 91 | 99 | 100 | ); 101 | })} 102 | 103 |
VersionReleaseGithubPublished onPyPi
{p.package.project_version}{p.package_filename} 80 | 85 | View Code 86 | 87 | 89 | 90 | 92 | 93 | Download 94 | 95 | 96 | Inspector 97 | 98 |
104 |
105 | 106 | ); 107 | } 108 | -------------------------------------------------------------------------------- /src/app/repositories/[name]/page.tsx: -------------------------------------------------------------------------------- 1 | import {parseISO, format} from "date-fns"; 2 | import byteSize from "byte-size"; 3 | import { getData } from "@/utils"; 4 | 5 | export default async function RepositoryDetail({ params }: { params: { name: string } }) { 6 | const data = await getData(); 7 | const repo = data.find((repo) => repo.name === params.name); 8 | if (repo == undefined) { 9 | return

Unknown repo

; 10 | } 11 | const earliest = parseISO(repo.stats.earliest_package); 12 | const latest = parseISO(repo.stats.latest_package); 13 | return ( 14 | <> 15 |
16 |

{repo.name}

17 |

18 | This repository contains {repo.stats.total_packages} packages published between{" "} 19 | {format(earliest, "dd/MM/yyyy")}{" "} 20 | and {format(latest, "dd/MM/yyyy")}. The compressed size of this repository is{" "} 21 | {byteSize(repo.size, { units: "iec", precision: 1 }).toString()} 22 |

23 |

24 | Link: {repo.url} 25 |

26 |
27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | {Object.entries(repo.projects) 38 | .sort(([, a], [, b]) => b - a) 39 | .map(([name, count]) => ( 40 | 41 | 44 | 49 | 54 | 55 | 56 | ))} 57 | 58 |
PackageGithubProjectCount
42 | {name} 43 | 45 | 46 | Browse Code on GitHub 47 | 48 | 50 | 51 | View all releases 52 | 53 | {count}
59 | 60 | ); 61 | } 62 | 63 | export async function generateStaticParams() { 64 | const repos = await getData(); 65 | return repos.map((repo) => ({ 66 | name: repo.name, 67 | })); 68 | } 69 | -------------------------------------------------------------------------------- /src/app/repositories/page.tsx: -------------------------------------------------------------------------------- 1 | import byteSize from "byte-size"; 2 | import { parseISO, format, differenceInDays } from "date-fns"; 3 | import RepoStats from "@/app/repositories/repo-stats"; 4 | import { getData } from "@/utils"; 5 | 6 | export default async function RepositoriesList() { 7 | const data = await getData(); 8 | 9 | return ( 10 | <> 11 |
12 |

Repositories

13 |
14 | 15 |
16 |

17 | Repositories are the top level of the PyPI data. Each repository contains one or more projects published to 18 | PyPI. This page shows the list of repositories with the size and completion percent. Click on a repository to 19 | view a list of packages contained within. 20 |

21 |
22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | {data 36 | .sort((a, b) => b.index - a.index) 37 | .map((p) => { 38 | const earliest = parseISO(p.stats.earliest_package); 39 | const latest = parseISO(p.stats.latest_package); 40 | return ( 41 | 42 | 47 | 50 | 51 | 52 | 53 | 54 | 55 | ); 56 | })} 57 | 58 |
NameRangeDaysSizePackagesProgress
43 | 44 | {p.name} 45 | 46 | 48 | {format(earliest, "dd/MM/yyyy")} to {format(latest, "dd/MM/yyyy")} 49 | {differenceInDays(latest, earliest)}{byteSize(p.size, { units: "iec", precision: 1 }).toString()}{p.stats.total_packages}{p.percent_done}
59 | 60 | ); 61 | } 62 | 63 | export type RepoData = { 64 | name: string; 65 | index: number; 66 | percent_done: number; 67 | size: number; 68 | url: string; 69 | packages_url: string; 70 | stats: { 71 | earliest_package: string; 72 | latest_package: string; 73 | total_packages: number; 74 | done_packages: number; 75 | }; 76 | projects: Map; 77 | }; 78 | 79 | -------------------------------------------------------------------------------- /src/app/repositories/repo-stats.tsx: -------------------------------------------------------------------------------- 1 | import { Bars3BottomRightIcon, CircleStackIcon, CodeBracketIcon } from "@heroicons/react/24/solid"; 2 | import byteSize from "byte-size"; 3 | import { RepoData } from "@/app/repositories/page"; 4 | 5 | export default function RepoStats({ data }: { data: RepoData[] }) { 6 | const repo_count = data.length; 7 | const total_releases = data 8 | .reduce((acc, repo) => acc + repo.stats.total_packages, 0) 9 | .toLocaleString(undefined, { minimumFractionDigits: 0 }); 10 | const total_size = data.reduce((acc, repo) => acc + repo.size, 0); 11 | return ( 12 |
13 |
14 |
15 | 16 |
17 |
Repositories
18 |
{repo_count}
19 |
20 | 21 |
22 |
23 | 24 |
25 |
Total Releases
26 |
{total_releases.toLocaleString()}
27 |
28 | 29 |
30 |
31 | 32 |
33 |
Total compressed size
34 |
35 | {byteSize(total_size, { 36 | units: "iec", 37 | precision: 1, 38 | }).toString()} 39 |
40 |
41 |
42 | ); 43 | } 44 | -------------------------------------------------------------------------------- /src/app/stats/chart-scroll.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { Chart } from "@/app/stats/chart"; 4 | import { useEffect, useState } from "react"; 5 | import ShowSQL from "@/app/stats/sql"; 6 | 7 | interface ChartScrollProps { 8 | chartData: any[]; 9 | charts: { name: string; valueNames: string[] }[]; 10 | formats?: { [key: string]: "bytes" }; 11 | sqlData?: string; 12 | cumulative?: boolean; 13 | showValueHeader?: (value: { [key: string]: string | number }) => string; 14 | } 15 | 16 | export default function ChartScroll({ 17 | chartData, 18 | charts, 19 | sqlData, 20 | cumulative = false, 21 | formats = {}, 22 | showValueHeader, 23 | }: ChartScrollProps) { 24 | const [chartIndex, setChartIndex] = useState(0); 25 | useEffect(() => { 26 | setChartIndex(0); 27 | }, [charts]); 28 | const selectedValueNames = chartIndex < charts.length ? charts[chartIndex].valueNames : charts[0].valueNames; 29 | 30 | return ( 31 | <> 32 |
33 | {charts.map((chart, index) => { 34 | const isSelected = index === chartIndex; 35 | return ( 36 |
37 | 40 |
41 | ); 42 | })} 43 |
44 | 51 | {sqlData && } 52 | 53 | ); 54 | } 55 | -------------------------------------------------------------------------------- /src/app/stats/chart.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import { 3 | CartesianGrid, 4 | Cell, 5 | Customized, 6 | LabelList, 7 | Legend, 8 | Line, 9 | LineChart, 10 | Pie, 11 | PieChart as RechartPieChart, 12 | ResponsiveContainer, 13 | Tooltip, 14 | XAxis, 15 | YAxis, 16 | } from "recharts"; 17 | import byteSize from "byte-size"; 18 | import { cumulative_sum } from "@/app/stats/utils"; 19 | import { genColours } from "./colours"; 20 | 21 | // const COLORS = ["#0088FE", "#00C49F", "#FFBB28", "#ff5100", "#FF8042"]; 22 | 23 | export function Chart({ 24 | chartData, 25 | valueNames, 26 | cumulative = false, 27 | formats = {}, 28 | showValueHeader, 29 | }: { 30 | chartData: { [key: string]: string | number }[]; 31 | valueNames: string[]; 32 | cumulative?: boolean; 33 | formats?: { [key: string]: "bytes" }; 34 | showValueHeader?: (value: { [key: string]: string | number }) => string; 35 | }) { 36 | // remove null values from the chartData array, stopping after first non-null value is found 37 | const firstNonNullIndex = chartData.findIndex((value) => { 38 | for (const valueName of valueNames) { 39 | if (value[valueName] !== 0) { 40 | return true; 41 | } 42 | } 43 | return false; 44 | }); 45 | if (firstNonNullIndex > 0) { 46 | chartData = chartData.slice(firstNonNullIndex); 47 | } 48 | 49 | if (cumulative) { 50 | chartData = cumulative_sum(chartData, valueNames); 51 | } 52 | 53 | const colours = genColours(valueNames.length); 54 | 55 | return ( 56 | <> 57 | {showValueHeader &&

{showValueHeader(chartData[chartData.length - 1])}

} 58 | 59 | 60 | {valueNames.map((valueName, index) => { 61 | const name = valueName.replaceAll("_", " "); 62 | return ( 63 | 72 | ); 73 | })} 74 | 75 | 76 | { 79 | // @ts-ignore 80 | return new Intl.NumberFormat("en").format(value); 81 | }} 82 | /> 83 | { 85 | // @ts-ignore 86 | if (formats[item.dataKey] == "bytes" && typeof value === "number") { 87 | return byteSize(value, { precision: 2, units: "iec" }).toString(); 88 | } 89 | // @ts-ignore 90 | return new Intl.NumberFormat("en").format(value); 91 | }} 92 | /> 93 | 94 | 95 | 96 | 97 | ); 98 | } 99 | 100 | export function PieChart({ 101 | chartData, 102 | dataKey, 103 | nameKey, 104 | limit, 105 | }: { 106 | chartData: any[]; 107 | dataKey: string; 108 | nameKey: string; 109 | limit?: number; 110 | }) { 111 | if (limit) { 112 | let rest_total = chartData.slice(limit, chartData.length).reduce((acc, value) => acc + value[dataKey], 0); 113 | let rest_item = { 114 | [nameKey]: "Other", 115 | [dataKey]: rest_total, 116 | }; 117 | chartData = [...chartData.slice(0, limit), rest_item]; 118 | } 119 | 120 | const colours = genColours(chartData.length); 121 | 122 | return ( 123 |
124 | 125 | 126 | 134 | {chartData.map((entry, index) => ( 135 | 136 | ))} 137 | {/**/} 138 | 139 | 140 | 141 | 142 | 143 |
144 | ); 145 | } 146 | -------------------------------------------------------------------------------- /src/app/stats/colours.ts: -------------------------------------------------------------------------------- 1 | import chroma from "chroma-js"; 2 | 3 | // chroma.scale('RdYlBu').domain(myValues, 7, 'quantiles'); 4 | let colors = ["orange", "skyblue", "red"]; 5 | 6 | export function genColours(value: number): string[] { 7 | return chroma.scale(colors).colors(value); 8 | } 9 | -------------------------------------------------------------------------------- /src/app/stats/language-stats.tsx: -------------------------------------------------------------------------------- 1 | export default async function getLanguageStats(): Promise { 2 | const res = await fetch("https://raw.githubusercontent.com/pypi-data/data/main/stats/language_stats.json"); 3 | 4 | if (!res.ok) { 5 | throw new Error("Failed to fetch data"); 6 | } 7 | const json_res = await res.text(); 8 | let data = []; 9 | for (const line of json_res.split("\n")) { 10 | if (line !== "") { 11 | data.push(JSON.parse(line)); 12 | } 13 | } 14 | 15 | return data as LanguageStats[]; 16 | } 17 | 18 | export type LanguageStats = { 19 | month: string; 20 | total: number; 21 | has_async: number; 22 | has_async_comp: number; 23 | 24 | has_fstring: number; 25 | has_annotations: number; 26 | 27 | has_try_star: number; 28 | has_match: number; 29 | has_walrus: number; 30 | 31 | has_dataclasses: number; 32 | 33 | has_generator_expression: number; 34 | has_list_comp: number; 35 | has_dict_comp: number; 36 | has_set_comp: number; 37 | }; 38 | 39 | export type TotalLanguageStats = { 40 | total: number; 41 | has_async: number; 42 | has_async_comp: number; 43 | 44 | has_fstring: number; 45 | has_annotations: number; 46 | 47 | has_try_star: number; 48 | has_match: number; 49 | has_walrus: number; 50 | 51 | has_dataclasses: number; 52 | 53 | has_generator_expression: number; 54 | has_list_comp: number; 55 | has_dict_comp: number; 56 | 
has_set_comp: number; 57 | }; 58 | 59 | export async function getTotalLanguageStats(): Promise { 60 | const res = await fetch("https://raw.githubusercontent.com/pypi-data/data/main/stats/language_stats_totals.json"); 61 | 62 | if (!res.ok) { 63 | throw new Error("Failed to fetch data"); 64 | } 65 | const json_res = await res.json(); 66 | return json_res as TotalLanguageStats; 67 | } 68 | -------------------------------------------------------------------------------- /src/app/stats/page.tsx: -------------------------------------------------------------------------------- 1 | import getStats from "@/app/stats/stats"; 2 | import TotalStats from "@/app/stats/total_stats"; 3 | import { PieChart } from "@/app/stats/chart"; 4 | import Table from "@/app/table"; 5 | import { InformationCircleIcon } from "@heroicons/react/24/solid"; 6 | import byteSize from "byte-size"; 7 | import ChartScroll from "@/app/stats/chart-scroll"; 8 | import ShowSQL from "@/app/stats/sql"; 9 | import extrapolate from "@/app/stats/shitpost-model"; 10 | import ShitpostChart from "@/app/stats/shitpost-chart"; 11 | import { cumulative_sum } from "@/app/stats/utils"; 12 | import getLanguageStats, { getTotalLanguageStats } from "@/app/stats/language-stats"; 13 | 14 | function InfoBubble({ text }: { text: string }) { 15 | return ( 16 |
17 | 18 | {text} 19 |
20 | ); 21 | } 22 | 23 | export default async function Page() { 24 | const data = await getStats(); 25 | const new_projects_over_time = data.new_projects_over_time.sort((a, b) => (a.month < b.month ? -1 : 1)); 26 | const new_project_versions_over_time = data.new_project_versions_over_time.sort((a, b) => 27 | a.month < b.month ? -1 : 1, 28 | ); 29 | const new_releases_over_time = data.new_releases_over_time.sort((a, b) => (a.month < b.month ? -1 : 1)); 30 | const chartData = data.stats_over_time.sort((a, b) => (a.month < b.month ? -1 : 1)); 31 | 32 | const combined_over_time_stats = new_projects_over_time.map((el, i) => ({ 33 | month: el.month, 34 | new_projects: el.count, 35 | new_project_versions: new_project_versions_over_time[i].count, 36 | new_releases: new_releases_over_time[i].count, 37 | total_files: chartData[i].total_files, 38 | total_lines: chartData[i].total_lines, 39 | total_size: chartData[i].total_size, 40 | })); 41 | console.log("This months stats:", combined_over_time_stats[combined_over_time_stats.length - 1]); 42 | 43 | const projectStats = data.project_level_breakdowns.sort((a, b) => (a.month < b.month ? -1 : 1)); 44 | 45 | const secretTypesResponse = await fetch( 46 | "https://raw.githubusercontent.com/pypi-data/data/main/stats/github_secret_totals.json", 47 | ); 48 | const secretTypes: Map = await secretTypesResponse.json(); 49 | const secretTypesTable = Object.entries(secretTypes) 50 | .sort((a, b) => (a[1] < b[1] ? 1 : -1)) 51 | .map(([type, count]) => ({ 52 | type, 53 | count, 54 | })); 55 | 56 | const lastMonth = chartData[chartData.length - 1]; 57 | 58 | const binarySizes = data.binary_sizes.map((el) => { 59 | const is_binary = el.is_binary ? 
"Binary" : "Text"; 60 | const text = `${is_binary}: ${byteSize(el.total_size, { precision: 1, units: "iec" })}`; 61 | return { 62 | ...el, 63 | text, 64 | }; 65 | }); 66 | 67 | const tensorflow_total_size = data.projects_by_files 68 | .filter((el) => el.project_name.startsWith("tf-") || el.project_name.startsWith("tensorflow-")) 69 | .reduce((acc, el) => acc + el.total_size, 0); 70 | const total_size = binarySizes.reduce((acc, el) => acc + el.total_size, 0); 71 | const tensorflow_percentage = Math.round((tensorflow_total_size / total_size) * 100); 72 | const tensorflow_human_size = byteSize(tensorflow_total_size, { precision: 1, units: "iec" }); 73 | 74 | const skip_reason_stats = data.skip_reason_stats.filter(({ skip_reason }) => skip_reason != ""); 75 | 76 | const years = 8; 77 | const extrapolated = extrapolate(years, combined_over_time_stats); 78 | // const extrapolated_cumulative_slice = cumulative_sum(extrapolated, ["new_releases"]); 79 | const in_future_years = cumulative_sum(extrapolated, ["new_releases"])[extrapolated.length - 2]; 80 | 81 | const languageStats = await getLanguageStats(); 82 | const languageStatsByMonth = languageStats.sort((a, b) => (a.month < b.month ? -1 : 1)); 83 | 84 | const totalLanguageStats = await getTotalLanguageStats(); 85 | let totalLanguageCount = totalLanguageStats.total; 86 | 87 | return ( 88 | <> 89 |

The contents of PyPI, in numbers

90 |
91 | 92 |
93 |
94 |

95 | This page contains a breakdown of the contents of PyPI from parsing the contents of packages. You can{" "} 96 | 97 | download PyPI locally 98 | {" "} 99 | to do your own analysis or run{" "} 100 | 101 | SQL queries on the data in your browser 102 | 103 |

104 |
105 |

Project Contents

106 |

107 | This data only counts unique projects, not versions. e.g if a project has published 10 versions 108 | in a month, each with a setup.py file, it will only be counted once. 109 |

110 | 127 |
128 |

Language Features

129 |

130 | This data only counts unique projects, not versions. e.g. if a project has published 10 versions 131 | in a month, each containing an async function, it will only be counted once. 132 |

133 |
134 |
135 | 159 |
160 |
161 |

Breakdown

162 | key != "total") 166 | .map(([key, value]) => { 167 | const percent = Math.round((value / totalLanguageCount) * 100); 168 | return { 169 | Name: key.replace("has_", "").replace("_", " "), 170 | Projects: value, 171 | Percent: percent, 172 | }; 173 | }) 174 | .sort((a, b) => (a.Projects < b.Projects ? 1 : -1))} 175 | columns={[{ name: "Name" }, { name: "Projects", type: "number" }, { name: "Percent", type: "number" }]} 176 | /> 177 | 178 | 179 |
180 |

Secrets Detected

181 |
182 |
183 | PyPI contains a lot of secrets. 184 | 185 |
186 |
187 |
188 | 189 | 190 |
191 |

Growth

192 | 193 | {/*
*/} 194 | {/*
*/} 195 | 201 | {/**/} 202 | {/*
*/} 203 | {/*
*/} 204 |
205 |

Binary files

206 |
207 |
208 |

209 | This shows a breakdown of the binary files on PyPI, by extension. Binary files are the vast majority of the 210 | content on PyPI, accounting for nearly 75% of the uncompressed size. 211 |

212 | 213 | 214 |
215 |
216 |
225 | 226 | 227 | 228 |
229 |
230 |
231 |

Largest Projects by size

232 | 235 |
245 | 246 | 247 |
248 |

Stats By Extensions

249 | 250 |
260 | 261 | 262 | 263 | 264 |
265 |

Files not committed to Github

266 |
267 |
268 | Not all files can be committed to GitHub due to size limits. Some have a few very, very long lines whilst 269 | others are junk like mistakenly added virtualenvs or VCS directories. This table shows a breakdown of the 270 | reasons why files were skipped. 271 |
272 |
273 |
288 | 289 | 290 | 291 | 292 | ); 293 | } 294 | -------------------------------------------------------------------------------- /src/app/stats/shitpost-chart.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import ChartScroll from "@/app/stats/chart-scroll"; 4 | import React, { useEffect, useState } from "react"; 5 | 6 | function easeInExpo(x: number) { 7 | return x === 0 ? 0 : Math.pow(2, 10 * x - 10); 8 | } 9 | 10 | export default function ShitpostChart({ 11 | chartData, 12 | extrapolated, 13 | years, 14 | future_value, 15 | }: { 16 | chartData: any[]; 17 | extrapolated: { month: string; new_releases: number; total_size: number; total_files: number }[]; 18 | years: number; 19 | future_value: number; 20 | }) { 21 | let [seeExtrapolate, setSeeExtrapolate] = useState(false); 22 | let [extrapolatedIndex, setExtrapolatedIndex] = useState(chartData.length); 23 | const extrapolatedLength = extrapolated.length - chartData.length; 24 | const extrapolatedOffset = extrapolatedIndex - chartData.length; 25 | 26 | const percentDone = extrapolatedOffset / extrapolatedLength; 27 | const percentLeft = 1 - percentDone; 28 | 29 | const humans = 8_000_000_000; 30 | const packages_per_human = future_value / humans; 31 | 32 | useEffect(() => { 33 | if (!seeExtrapolate && extrapolatedIndex !== 0) { 34 | setExtrapolatedIndex(chartData.length); 35 | } 36 | }, [seeExtrapolate, chartData, extrapolatedIndex]); 37 | 38 | useEffect(() => { 39 | if (seeExtrapolate && extrapolatedIndex < extrapolated.length - 1) { 40 | const timer = setTimeout(() => setExtrapolatedIndex(extrapolatedIndex + 1), easeInExpo(percentLeft) * 750); 41 | return () => clearTimeout(timer); 42 | } 43 | }, [percentLeft, chartData, extrapolated, seeExtrapolate, extrapolatedIndex]); 44 | 45 | const time = extrapolated[extrapolatedIndex].month; 46 | 47 | return ( 48 | <> 49 |
50 |
51 | 55 | Intl.NumberFormat("en-US", { 56 | notation: "compact", 57 | compactDisplay: "long", 58 | maximumFractionDigits: 2, 59 | // @ts-ignore 60 | }).format(new_releases) 61 | } 62 | formats={{ total_size: "bytes" }} 63 | charts={[{ name: "Releases", valueNames: ["new_releases"] }]} 64 | /> 65 |
66 |
67 | 71 | Intl.NumberFormat("en", { 72 | notation: "compact", 73 | style: "unit", 74 | unit: "byte", 75 | unitDisplay: "narrow", 76 | // @ts-ignore 77 | }).format(total_size) 78 | } 79 | formats={{ total_size: "bytes" }} 80 | charts={[{ name: "Size", valueNames: ["total_size"] }]} 81 | /> 82 |
83 |
84 | 88 | Intl.NumberFormat("en-US", { 89 | notation: "compact", 90 | compactDisplay: "long", 91 | maximumFractionDigits: 1, 92 | // @ts-ignore 93 | }).format(total_files) 94 | } 95 | formats={{ total_size: "bytes" }} 96 | charts={[{ name: "Files", valueNames: ["total_files"] }]} 97 | /> 98 |
99 |
100 |

101 | PyPI is growing fast. If this dangerous expansion is not stopped, our advanced machine learning models predict that 102 | in only {years} years the number of packages will outnumber human beings. 103 |

104 | 109 | 110 | ); 111 | } 112 | -------------------------------------------------------------------------------- /src/app/stats/shitpost-model.tsx: -------------------------------------------------------------------------------- 1 | import { cumulative_sum } from "@/app/stats/utils"; 2 | 3 | export default function extrapolate( 4 | years: number, 5 | values: { 6 | month: string; 7 | new_releases: number; 8 | total_files: number; 9 | total_size: number; 10 | }[], 11 | ) { 12 | let releases_extrapolated = extrapolateSeries( 13 | years, 14 | values.map((el) => el.new_releases), 15 | ); 16 | let files_extrapolated = extrapolateSeries( 17 | years, 18 | values.map((el) => el.total_files), 19 | ); 20 | let size_extrapolated = extrapolateSeries( 21 | years, 22 | values.map((el) => el.total_size), 23 | ); 24 | 25 | let extrapolated = []; 26 | let date = new Date(values[values.length - 1].month); 27 | for (let i = 0; i < years * 12; i++) { 28 | let month = `${date.getFullYear()}-${String(date.getMonth() + 1).padStart(2, "0")}-01`; 29 | // increase by growth per month 30 | 31 | extrapolated.push({ 32 | month, 33 | new_releases: releases_extrapolated[i], 34 | total_files: files_extrapolated[i], 35 | total_size: size_extrapolated[i], 36 | }); 37 | 38 | date = new Date(date.setMonth(date.getMonth() + 1)); 39 | } 40 | 41 | // console.log(extrapolated.slice(10)) 42 | return [...values, ...extrapolated]; 43 | } 44 | 45 | function extrapolateSeries(years: number, values: number[]) { 46 | let time_slice = cumulative_sum( 47 | values.slice(values.length - 12, values.length).map((x) => ({ x })), 48 | ["x"], 49 | ).map(({ x }) => x); 50 | let releases_diff = time_slice[time_slice.length - 1] - time_slice[0]; 51 | let growth_percent = releases_diff / time_slice[0]; 52 | let growth_per_month = growth_percent / time_slice.length; 53 | 54 | let last_value = values[values.length - 1]; 55 | let extrapolated = []; 56 | for (let i = 0; i < years * 12; i++) { 57 | // increase by growth per 
month 58 | // let increase = last_value * growth_per_month; 59 | last_value = last_value * growth_per_month; 60 | extrapolated.push(last_value); 61 | } 62 | return extrapolated; 63 | } 64 | -------------------------------------------------------------------------------- /src/app/stats/sql.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import SyntaxHighlight from "@/app/datasets/syntax"; 4 | import { useState } from "react"; 5 | 6 | export default function ShowSQL({ sqlData }: { sqlData: string }) { 7 | const [expandSQL, setExpandSQL] = useState(false); 8 | return ( 9 |
10 |
11 | 14 | {expandSQL && ( 15 | {`-- https://github.com/pypi-data/data/\n\n${sqlData}`} 16 | )} 17 |
18 |
19 | ); 20 | } 21 | -------------------------------------------------------------------------------- /src/app/stats/stats.tsx: -------------------------------------------------------------------------------- 1 | import { compile, CompileOptions } from "prql-js/dist/bundler"; 2 | 3 | export default async function getStats(): Promise { 4 | const res = await fetch("https://raw.githubusercontent.com/pypi-data/data/main/stats/totals.json"); 5 | 6 | if (!res.ok) { 7 | throw new Error("Failed to fetch data"); 8 | } 9 | 10 | const json_res = await res.json(); 11 | 12 | const rawPrqlRes = await fetch("https://raw.githubusercontent.com/pypi-data/data/main/sql/_stats.prql"); 13 | const rawPrql = await rawPrqlRes.text(); 14 | const opts = new CompileOptions(); 15 | opts.target = "sql.duckdb"; 16 | opts.format = true; 17 | opts.signature_comment = false; 18 | 19 | const repo_stats = { sql: {} }; 20 | for (const item of json_res as { name: string; stat: any[] }[]) { 21 | // @ts-ignore 22 | repo_stats[item.name] = item.stat; 23 | const sql = compile(`${rawPrql}\nrelation_to_json(${item.name})`, opts); 24 | if (sql === undefined) { 25 | throw Error(`Failed to compile PRQL for ${item.name}`); 26 | } 27 | // @ts-ignore 28 | repo_stats.sql[item.name] = sql.replace("$1", "'data_from_the_datasets_page/*.parquet'"); 29 | } 30 | return repo_stats as RepoStats; 31 | } 32 | 33 | export type RepoStats = { 34 | total_stats: [TotalStat]; 35 | stats_over_time: StatsOverTime[]; 36 | skipped_files_stats: InnerStat[]; 37 | binary_extension_stats: InnerStat[]; 38 | extension_stats: InnerStat[]; 39 | projects_by_files: ProjectStat[]; 40 | skip_reason_stats: SkipReasonStat[]; 41 | binary_sizes: [{ is_binary: boolean; total_files: number; total_size: number }]; 42 | project_level_breakdowns: ProjectLevelBreakdown[]; 43 | new_projects_over_time: [{ month: string; count: number }]; 44 | new_project_versions_over_time: [{ month: string; count: number }]; 45 | new_releases_over_time: [{ month: string; 
count: number }]; 46 | 47 | sql: { 48 | stats_over_time: string; 49 | project_level_breakdowns: string; 50 | binary_sizes: string; 51 | binary_extension_stats: string; 52 | projects_by_files: string; 53 | extension_stats: string; 54 | }; 55 | }; 56 | 57 | export type TotalStat = { 58 | total_files: number; 59 | total_lines: number; 60 | total_size: number; 61 | unique_files: number; 62 | }; 63 | 64 | export type ProjectLevelBreakdown = { 65 | month: string; // primitive type, consistent with StatsOverTime.month 66 | total_project_uploads: number; 67 | project_version_releases: number; 68 | 69 | has_pyproject: number; 70 | has_setup_py: number; 71 | has_setup_py_and_pyproject: number; 72 | has_requirements_txt: number; 73 | 74 | init_py_files: number; 75 | 76 | has_markdown: number; 77 | has_rst: number; 78 | 79 | has_tests: number; 80 | has_tox: number; 81 | has_pytest: number; 82 | 83 | has_ini: number; 84 | has_json: number; 85 | has_xml: number; 86 | has_toml: number; 87 | has_yaml: number; 88 | has_rust: number; 89 | has_c_or_cpp: number; 90 | 91 | has_pyi: number; 92 | has_py_typed: number; 93 | }; 94 | 95 | export type InnerStat = { 96 | extension: string; 97 | total_files: number; 98 | total_lines: number; 99 | total_size: number; 100 | unique_files: number; 101 | }; 102 | 103 | export type ProjectStat = { 104 | project_name: string; 105 | unique_files: number; 106 | total_files: number; 107 | total_lines: number; 108 | total_size: number; 109 | }; 110 | 111 | export type StatsOverTime = { 112 | month: string; 113 | total_files: number; 114 | total_size: number; 115 | total_lines: number; 116 | }; 117 | 118 | export type SkipReasonStat = { 119 | skip_reason: string; 120 | total_projects: number; 121 | count: number; 122 | unique_files: number; 123 | total_size: number; 124 | total_lines: number; 125 | max_size: number; 126 | max_lines: number; 127 | }; 128 | -------------------------------------------------------------------------------- /src/app/stats/total_stats.tsx: 
-------------------------------------------------------------------------------- 1 | import byteSize from "byte-size"; 2 | import { Bars3BottomRightIcon, BoltIcon, CircleStackIcon, CodeBracketIcon } from "@heroicons/react/24/solid"; 3 | import { StatsOverTime, TotalStat } from "@/app/stats/stats"; 4 | 5 | export default function TotalStats({ stats, lastMonth }: { stats: TotalStat; lastMonth: StatsOverTime }) { 6 | const total_hours_in_a_month = 24 * 30; 7 | const lines_per_second = lastMonth.total_lines / (total_hours_in_a_month * 60 * 60); 8 | return ( 9 | <> 10 |
11 |
12 |
13 | 14 |
15 |
Total files
16 |
17 | {(stats.total_files / 1000 / 1000 / 1000).toLocaleString(undefined, { maximumFractionDigits: 2 })} Billion 18 |
19 |
{stats.unique_files.toLocaleString()} unique
20 |
21 | 22 |
23 |
24 | 25 |
26 |
Total lines of text
27 |
28 | {(stats.total_lines / 1000 / 1000 / 1000).toLocaleString(undefined, { maximumFractionDigits: 1 })} Billion 29 |
30 |
{stats.total_lines.toLocaleString()} to be precise
31 |
32 | 33 |
34 |
35 | 36 |
37 |
Total uncompressed size
38 |
39 | {byteSize(stats.total_size, { 40 | units: "iec", 41 | precision: 1, 42 | }).toString()} 43 |
44 |
That is ~{(stats.total_size / 1468006).toLocaleString()} floppy disks
45 |
46 | 47 |
48 |
49 | 50 |
51 |
Lines of code added per second
52 |
53 | {lines_per_second.toLocaleString(undefined, { maximumFractionDigits: 0 })} 54 |
55 |
In the month {lastMonth.month}
56 |
57 |
58 | 59 | ); 60 | } 61 | -------------------------------------------------------------------------------- /src/app/stats/utils.tsx: -------------------------------------------------------------------------------- 1 | export function cumulative_sum(data: any[], valueNames: string[]): any[] { 2 | const chartDataDeepCopy = JSON.parse(JSON.stringify(data)); 3 | for (const valueName of valueNames) { 4 | let sum = 0; 5 | for (const value of chartDataDeepCopy) { 6 | sum += value[valueName] as number; 7 | value[valueName] = sum; 8 | } 9 | } 10 | return chartDataDeepCopy; 11 | } 12 | -------------------------------------------------------------------------------- /src/app/swr-provider.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import { SWRConfig } from "swr"; 3 | // @ts-ignore 4 | export const SWRProvider = ({ children }) => { 5 | return fetch(url).then((res) => res.json()) }}>{children}; 6 | }; 7 | -------------------------------------------------------------------------------- /src/app/table.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { flexRender, getCoreRowModel, useReactTable } from "@tanstack/react-table"; 4 | import byteSize from "byte-size"; 5 | import { useMemo, useState } from "react"; 6 | 7 | type Column = { 8 | name: string; 9 | type?: "string" | "number" | "bytes"; 10 | }; 11 | 12 | interface TableProps { 13 | data: Record[]; 14 | columns: Column[]; 15 | initialLimit?: number; 16 | addFooter?: boolean; 17 | } 18 | 19 | export default function Table({ data, columns, initialLimit = 15, addFooter = true }: TableProps) { 20 | const [expanded, setExpanded] = useState(false); 21 | // This is needed to stop a re-render loop? No idea why. 
22 | const limitedData = useMemo(() => { 23 | const numbersCopy = JSON.parse(JSON.stringify(data)); 24 | if (!expanded) { 25 | return numbersCopy.slice(0, initialLimit); 26 | } else { 27 | return numbersCopy; 28 | } 29 | }, [data, expanded, initialLimit]); 30 | const hasMore = data.length > initialLimit; 31 | 32 | const table = useReactTable({ 33 | data: limitedData, 34 | columns: columns.map((column) => ({ 35 | id: column.name, 36 | header: column.name.replace(/_/g, " "), // global regex: replace every underscore, not only the first 37 | footer: !addFooter 38 | ? undefined 39 | : ({ table }) => { 40 | if (column.name == columns[0].name) { 41 | return "Total"; 42 | } 43 | if (column.type === "bytes" || column.type === "number") { 44 | // @ts-ignore 45 | const total = data.reduce((total, row) => total + row[column.name], 0); 46 | if (column.type == "number") { 47 | return total.toLocaleString(); 48 | } 49 | return byteSize(total, { units: "iec", precision: 1 }).toString(); 50 | } 51 | }, 52 | cell: (props) => { 53 | const row = props.getValue(); 54 | if (column.type === undefined) { 55 | if (row == "") { 56 | return `No ${column.name}`; 57 | } else { 58 | return row; 59 | } 60 | } 61 | if (column.type === "string" || typeof row === "string") { 62 | return row; 63 | } else if (column.type === "number") { 64 | return row.toLocaleString(); 65 | } else if (column.type === "bytes") { 66 | return byteSize(row, { units: "iec", precision: 1 }).toString(); 67 | } 68 | }, 69 | accessorKey: column.name, 70 | })), 71 | getCoreRowModel: getCoreRowModel(), 72 | }); 73 | 74 | return ( 75 |
76 | 77 | {table.getHeaderGroups().map((headerGroup) => ( 78 | 79 | {headerGroup.headers.map((header) => ( 80 | 83 | ))} 84 | 85 | ))} 86 | 87 | 88 | {table.getRowModel().rows.map((row) => ( 89 | 90 | {row.getVisibleCells().map((cell) => ( 91 | 92 | ))} 93 | 94 | ))} 95 | 96 | 97 | {table.getFooterGroups().map((footerGroup) => ( 98 | 99 | {footerGroup.headers.map((header) => ( 100 | 101 | ))} 102 | 103 | ))} 104 | {hasMore && ( 105 | 106 | 116 | 117 | )} 118 | 119 |
81 | {header.isPlaceholder ? null : flexRender(header.column.columnDef.header, header.getContext())} 82 |
{flexRender(cell.column.columnDef.cell, cell.getContext())}
{flexRender(header.column.columnDef.footer, header.getContext())}
107 | 115 |
120 | ); 121 | } 122 | -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import path from "path"; 3 | import { RepoData } from "@/app/repositories/page"; 4 | 5 | const allRepoData = JSON.parse( 6 | fs.readFileSync(path.join(process.cwd(), "src/data/repositories_with_releases.json"), "utf-8"), 7 | ) as RepoData[]; 8 | 9 | export async function getData(): Promise { 10 | return allRepoData as RepoData[]; 11 | // const res = await fetch('https://raw.githubusercontent.com/pypi-data/data/main/stats/repositories_with_releases.json') 12 | // 13 | // if (!res.ok) { 14 | // // This will activate the closest `error.js` Error Boundary 15 | // throw new Error('Failed to fetch data') 16 | // } 17 | // 18 | // return res.json() 19 | } 20 | -------------------------------------------------------------------------------- /tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | module.exports = { 3 | content: [ 4 | "./src/pages/**/*.{js,ts,jsx,tsx,mdx}", 5 | "./src/components/**/*.{js,ts,jsx,tsx,mdx}", 6 | "./src/app/**/*.{js,ts,jsx,tsx,mdx}", 7 | ], 8 | plugins: [require("daisyui"), require("@tailwindcss/typography")], 9 | daisyui: { 10 | themes: ["dark"], 11 | }, 12 | theme: { 13 | extend: { 14 | typography: { 15 | DEFAULT: { 16 | css: { 17 | maxWidth: null, // full width 18 | }, 19 | }, 20 | }, 21 | }, 22 | }, 23 | }; 24 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es6", 
4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "forceConsistentCasingInFileNames": true, 9 | "noEmit": true, 10 | "esModuleInterop": true, 11 | "module": "esnext", 12 | "moduleResolution": "node", 13 | "resolveJsonModule": true, 14 | "isolatedModules": true, 15 | "jsx": "preserve", 16 | "incremental": true, 17 | "plugins": [ 18 | { 19 | "name": "next" 20 | } 21 | ], 22 | "paths": { 23 | "@/*": ["./src/*"], 24 | "@public/*": ["./public/*"] 25 | } 26 | }, 27 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 28 | "exclude": ["node_modules"], 29 | "ts-node": { 30 | // these options are overrides used only by ts-node 31 | // same as the --compilerOptions flag and the TS_NODE_COMPILER_OPTIONS environment variable 32 | "compilerOptions": { 33 | "module": "commonjs" 34 | } 35 | } 36 | } 37 | --------------------------------------------------------------------------------