├── .github └── workflows │ ├── update_csv_files.yml │ └── website.yml ├── .gitignore ├── README.md ├── charts_data ├── contests_prize_pool_distribution.ipynb ├── duplicates.ipynb ├── findings_value.ipynb ├── participants_by_report_severity.ipynb ├── participants_growth.ipynb ├── participants_longevity.ipynb ├── severity_revenue.ipynb ├── warden_stats.ipynb └── wardens_team_vs_solo.ipynb ├── contests_code4rena.csv ├── github_code4rena.csv ├── leaderboard_code4rena.csv ├── main.py ├── requirements.txt ├── runtime.txt ├── scrapers ├── GithubScraper.py ├── WebScraper.py └── __init__.py └── site ├── index.html └── static ├── contests_prize_pool_distribution.html ├── duplicates.html ├── findings_value.html ├── index.html ├── participants_by_report_severity.html ├── participants_growth.html ├── participants_longevity.html ├── severity_revenue.html ├── warden_stats.html └── wardens_team_vs_solo.html /.github/workflows/update_csv_files.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Update scraped data 4 | 5 | # Controls when the workflow will run 6 | on: 7 | schedule: 8 | - cron: "0 12 * * *" 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 13 | jobs: 14 | # This workflow contains a single job called "build" 15 | scrape: 16 | # The type of runner that the job will run on 17 | runs-on: ubuntu-latest 18 | env: 19 | API_ACCESS_TOKEN: ${{ secrets.API_ACCESS_TOKEN }} 20 | GH_TOKEN: ${{ secrets.API_ACCESS_TOKEN }} 21 | 22 | # Steps represent a sequence of tasks that will be executed as part of the job 23 | steps: 24 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 25 | - uses: actions/checkout@v3 26 | 27 | - name: Setup Python 28 | uses: actions/setup-python@v4.2.0 29 | 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | 35 | - name: Run scraping script 36 | run: python main.py all 37 | 38 | - name: Git Auto Commit 39 | uses: stefanzweifel/git-auto-commit-action@v4.14.1 40 | with: 41 | commit_message: Updated all scraped data (CSV) 42 | file_pattern: '*.csv' 43 | -------------------------------------------------------------------------------- /.github/workflows/website.yml: -------------------------------------------------------------------------------- 1 | name: Build website 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - '**.csv' 9 | workflow_dispatch: 10 | 11 | permissions: 12 | contents: write 13 | pages: write 14 | id-token: write 15 | 16 | # Allow one concurrent deployment 17 | concurrency: 18 | group: "pages" 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | # Single deploy job since we're just deploying 23 | deploy: 24 | environment: 25 | name: github-pages 26 | url: ${{ steps.deployment.outputs.page_url }} 27 | runs-on: ubuntu-latest 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v3 31 | - name: Setup Pages 32 | uses: actions/configure-pages@v2 33 | - uses: actions/setup-python@v4 34 | with: 35 | python-version: "3.11" 36 | - run: | 37 | python -m pip install --upgrade pip 38 | pip install -r requirements.txt 39 | pip uninstall rfc3986-validator -y 40 | - name: Update analysis notebooks 41 | run: | 42 | for filename in charts_data/*.ipynb; do 43 | jupyter nbconvert --to notebook --execute 
$filename --ExecutePreprocessor.kernel_name='python3' --inplace 44 | done 45 | - name: Convert notebooks to HTML 46 | run: | 47 | mkdir -p site/static 48 | for filename in charts_data/*.ipynb; do 49 | jupyter nbconvert --to html $filename 50 | mv ${filename%.*}.html site/static/ 51 | done 52 | - name: Install and build index 53 | run: | 54 | sudo apt-get update 55 | sudo apt-get install curl git -y 56 | curl https://raw.githubusercontent.com/jayanta525/apindex-v2/master/sudo-install.sh | bash 57 | cd site/ 58 | apindex . 59 | - name: Upload artifacts 60 | uses: actions/upload-pages-artifact@v1 61 | with: 62 | path: 'site/' 63 | - name: Deploy to GitHub Pages 64 | id: deployment 65 | uses: actions/deploy-pages@v1 66 | - name: Commit 67 | uses: stefanzweifel/git-auto-commit-action@v4 68 | with: 69 | commit_message: Automated static html notebooks build 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | __pycache__/ 3 | repos_data/ 4 | *.ipynb_* 5 | *.log 6 | *.sqlite 7 | *.xlsx 8 | *.sublime* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > [!IMPORTANT] 2 | > This repository is archived due to significant changes to both Code4rena's website and repos since this project first started over a year ago. Additionally, Code4rena started providing data through their [community resources](https://github.com/code-423n4/code423n4.com/blob/main/_data/README-community_resources.md). 3 | > A new repo is available as [code4rena-stats](https://github.com/Krow10/code4rena-stats) for the charts and insights. 4 | 5 | # Archived README.md 6 | 7 | ## code4rena-scraper 8 | Scraping [Code4rena](https://www.code4rena.com) contest audit reports for stats, fun (and profit ?). 9 | 10 | For accurate prize money numbers, check the Code4rena [leaderboard](https://code4rena.com/leaderboard) directly. 11 | 12 | ### Why ? 13 | 14 | To play around with the [Github API](https://docs.github.com/en/rest) and work on my Python scripting skills. It also gave me the chance to work with data analysis tools such as [Jupyter notebooks](https://jupyter.org/), [Pandas](https://pandas.pydata.org/docs/index.html) for manipulating the data and [Altair](https://altair-viz.github.io/index.html), a visualization framework for generating charts. 15 | 16 | In the beginning, I was curious since I found out that the audit report repos contain the address of each participant for sending their prize money (see [here](https://github.com/code-423n4/2021-05-nftx-findings/tree/main/data) for example, in the .json files). I thought it would be interesting to try and track the flow of funds (which could be an issue if certain people want to stay anonymous on this platform). However, this part is currently left out and the project quickly evolved into extracting data and building statistics from the Code4rena contests. 17 | 18 | Also, I realized after a week of working on this project that the [website repo](https://github.com/code-423n4/code423n4.com/tree/main/_data) of Code4rena already contains data for contests, findings and handles but hey, I learned a lot about the scraping process ! 19 | 20 | ### What ? 
21 | 22 | Data is scraped from the [Code4rena](https://www.code4rena.com) published audit repos using the [Github API](https://docs.github.com/en/rest), as well as directly from the [leaderboard](https://code4rena.com/leaderboard) and [contests](https://code4rena.com/contests/) entries of the Code4rena website, and is parsed to CSV files. Original CSV files can also be used directly from the [Code4rena repo](https://github.com/code-423n4/code423n4.com/tree/main/_data) in the contests/ and findings/ folders. 23 | 24 | Part of the data extracted can be used to link ETH/Polygon addresses to contest participants. Using tools like [polygonscan](https://polygonscan.com), [etherscan](https://etherscan.io) or [Bitquery](https://explorer.bitquery.io/) makes it possible to look at the flow of funds from and to those wallets (***this part hasn't been implemented or explored too much yet***). 25 | 26 | Is it useful ? Probably not. 27 | 28 | Worth the time ? I'd say yes, as it gave me insights as to how to track funds across different chains (Polygon, Ethereum mainnet, etc.). 29 | 30 | Also, the extracted data makes it possible to see who might be the most efficient, who writes the most duplicates, the percentage of invalid submissions, etc. 31 | 32 | #### Jupyter notebooks 33 | Notebooks can be found in the [charts_data](charts_data/) folder to visualize the data. A link is provided below each chart for a static view of each notebook. 34 | For an interactive lab, you could set up your own locally or run one online [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Krow10/code4rena-scraper/HEAD). 35 | 36 | You can also run non-interactive notebooks through [nbviewer](https://nbviewer.org/github/Krow10/code4rena-scraper/tree/master/charts_data/) or view the statically generated HTML at [https://krow10.github.io/code4rena-scraper/](https://krow10.github.io/code4rena-scraper/). 37 | 38 | ### How ? 39 | 40 | Install all requirements through `pip install -r requirements.txt` and set up your own [Github access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) in the `.env` file. 41 | 42 | Then use [`main.py [leaderboard|contests|github|all]`](main.py) to fetch and parse the latest data into CSV files. A Github action is available for updating the CSV files in this repo directly. 43 | 44 | Currently, the extracted data from the Github API ([github_code4rena.csv](github_code4rena.csv)) looks like this: 45 | | contest_id | handle | address | risk | title | issueId | issueUrl | contest_sponsor | date | tags | issueCreation | 46 | | ---------- | ------ | ------- | ---- | ----- | ------- | -------- | --------------- | ---- | ---- | ------------- | 47 | | Identifies the contest | Name of the warden | Polygon address | Characterizes the submission severity (0 to 3, G for gas optimization, Q for QA) | Title of the submission | Github issue number | Github issue URL (unused) | Contest sponsor extracted from repo's name | Contest running date extracted from repo's name | Tags associated with the issue (further characterize the submission) | Creation time of the issue | 48 | 49 | So each line in the CSV file corresponds to one submission (identified by the `issueId`) of a warden (identified by their `(handle, address)` pair) for a given contest (identified by the `contest_id`). 
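As a quick illustration of that layout, the snippet below aggregates the rows into per-warden counts. It is a minimal sketch: it only relies on the column names from the table above and assumes the CSV has been generated (or downloaded from this repo) locally.

```python
import pandas as pd

# Each row of github_code4rena.csv is one submission, so counting unique
# issue ids per (contest, warden) gives that warden's activity in the contest.
data = pd.read_csv("github_code4rena.csv")  # Adjust the path to where the CSV lives

submissions_per_contest = (
    data.groupby(["contest_id", "handle"])["issueId"]
    .nunique()
    .rename("submissions")
    .reset_index()
)

# Same idea, broken down by risk level (0-3, G, Q), one column per level
submissions_by_risk = (
    data.groupby(["handle", "risk"])["issueId"]
    .nunique()
    .unstack(fill_value=0)
)

print(submissions_per_contest.head())
print(submissions_by_risk.head())
```

Most of the notebooks in [charts_data](charts_data/) build on this same group-then-aggregate pattern before handing the result to Altair.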
50 | 51 | The data can then be imported inside a Jupyter notebook (or anywhere else, however you want to parse it) for easy processing and visualization like so: 52 | ```python 53 | import pandas as pd 54 | import altair as alt 55 | 56 | alt.data_transformers.disable_max_rows() # Disable 5_000 rows limit 57 | data = pd.read_csv("../github_code4rena.csv") # Set path accordingly 58 | 59 | # Visualize whatever (see https://altair-viz.github.io) 60 | alt.Chart(...) 61 | ``` 62 | 63 | For the leaderboard ([leaderboard_code4rena.csv](leaderboard_code4rena.csv)), the data looks like this: 64 | | period | handle | is_team | prize_money | total_reports | high_all | high_solo | med_all | med_solo | gas_all 65 | | ------ | ------ | ------- | ----------- | ------------- | -------- | --------- | ------- | -------- | ------- 66 | | The period the data comes from | Name of the warden | Boolean indicating if the handle refers to a team or not | Total earnings for the period (in $USD) | Total accepted reports for the period | High severity issues found with others | High severity issues found alone | Medium severity issues found with others | Medium severity issues found alone | Gas optimization reports submitted 67 | 68 | And for the contests ([contests_code4rena.csv](contests_code4rena.csv)), the data looks like this: 69 | | contest_report_repo | contest_sponsor | contest_desc | start | end | prize_pool | handle | prize_money | total_reports | high_all | high_solo | med_all | med_solo | gas_all 70 | | - | - | - | - | - | - | - | - | - | - | - | - | - | - 71 | | The name of the Github repo for the contest audit report or empty if not published yet | Name of the contest sponsor (lowercase, stripped) | Description of the contest sponsor | Starting date of the contest | Ending date of the contest | Total prize pool (calculated from the sum of wardens' prize money) | Name of the warden | Total earnings for the contest (in $USD) | Total accepted reports for the contest | High severity issues found with others | High severity issues found alone | Medium severity issues found with others | Medium severity issues found alone | Gas optimization reports submitted 72 | 73 | ### Next ? 
74 | 75 | - [x] Get linked audits issues tags and add the data to the csv (helps flag invalid, duplicate and accepted submissions) 76 | - [x] Use data analysis modules or external programs to actually do something with the data 77 | - [x] For each contest, scrape the prize pool and results from the Code4rena contest page ([example](https://code4rena.com/contests/2021-02-slingshot-finance-contest)) and make a [ridgeline plot](https://altair-viz.github.io/gallery/ridgeline_plot.html) showing the distribution of rewards for each prize pool amount (with layered distribution for same pool amount) or simpler [boxplots](https://altair-viz.github.io/gallery/boxplot.html) 78 | - [x] Rework Github scraping for returning DataFrame for consistency 79 | - [x] ~~Try to make [ridgeline](https://altair-viz.github.io/gallery/ridgeline_plot.html) work (it looks so sick!)~~ *not best for this kind of data actually* 80 | - [x] ~~Rework scraping of issue labels to identify first labels (meaning original submission severity level) and last labels or maybe track entire history of labels in chronological order~~ *done through parsing with pandas* 81 | - [x] ~~Valid / invalid reports charts by contest sorted by start date (bars again ?)~~ *done and more in warden_stats* 82 | - [ ] Connect to Polygon/Ethereum blockchain to show the balances of the addresses listed 83 | - [ ] Add command line argument parsing 84 | - [ ] Make CSV files auto-update through workflow when changes happens on the Code4rena repo 85 | - [x] ~~Some more data mining from on-chain data maybe (GraphQL API would be best)~~ *won't do, no time* 86 | -------------------------------------------------------------------------------- /charts_data/findings_value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b0ba6704", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2023-01-06T15:59:46.927300Z", 10 | "iopub.status.busy": "2023-01-06T15:59:46.926643Z", 11 | "iopub.status.idle": "2023-01-06T15:59:47.306523Z", 12 | "shell.execute_reply": "2023-01-06T15:59:47.305684Z" 13 | } 14 | }, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "text/plain": [ 19 | "DataTransformerRegistry.enable('default')" 20 | ] 21 | }, 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "output_type": "execute_result" 25 | } 26 | ], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import altair as alt\n", 30 | "alt.data_transformers.disable_max_rows() # Disable 5_000 rows limit" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "93c05a5d", 37 | "metadata": { 38 | "execution": { 39 | "iopub.execute_input": "2023-01-06T15:59:47.309189Z", 40 | "iopub.status.busy": "2023-01-06T15:59:47.308760Z", 41 | "iopub.status.idle": "2023-01-06T15:59:47.436029Z", 42 | "shell.execute_reply": "2023-01-06T15:59:47.435491Z" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "findings_data = pd.read_csv(\"https://raw.githubusercontent.com/code-423n4/code423n4.com/main/_data/findings/findings.csv\") # Set path accordingly\n", 48 | "findings_data[\"contestid\"] = findings_data[\"contest\"]\n", 49 | "contests_data = pd.read_csv(\"https://raw.githubusercontent.com/code-423n4/code423n4.com/main/_data/contests/contests.csv\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "11933531", 56 | "metadata": { 57 | "execution": { 58 | "iopub.execute_input": "2023-01-06T15:59:47.439249Z", 59 | 
"iopub.status.busy": "2023-01-06T15:59:47.438932Z", 60 | "iopub.status.idle": "2023-01-06T15:59:47.475227Z", 61 | "shell.execute_reply": "2023-01-06T15:59:47.474686Z" 62 | } 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "df = pd.merge(findings_data, contests_data[[\"contestid\", \"end_time\"]], on=\"contestid\").drop_duplicates()\n", 67 | "df[\"end_time\"] = pd.to_datetime(df[\"end_time\"])\n", 68 | "df[\"risk_label\"] = df[\"risk\"].map(\n", 69 | " {\n", 70 | " '0': '0_Very low (unused since February 2022)', \n", 71 | " '1': '1_Low (unused since February 2022)',\n", 72 | " '2': '2_Medium',\n", 73 | " '3': '3_High',\n", 74 | " 'g': 'g_Gas optimization',\n", 75 | " 'q': 'q_QA report',\n", 76 | " }\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "150ea944", 84 | "metadata": { 85 | "execution": { 86 | "iopub.execute_input": "2023-01-06T15:59:47.482025Z", 87 | "iopub.status.busy": "2023-01-06T15:59:47.481823Z", 88 | "iopub.status.idle": "2023-01-06T15:59:47.486206Z", 89 | "shell.execute_reply": "2023-01-06T15:59:47.485427Z" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "ordered_legend_reports_labels = [\n", 95 | " '3_High',\n", 96 | " '2_Medium',\n", 97 | " 'g_Gas optimization',\n", 98 | " 'q_QA report',\n", 99 | " '1_Low (unused since February 2022)',\n", 100 | " '0_Very low (unused since February 2022)', \n", 101 | "]\n", 102 | "label_colors = [\"#FE266D\",\"#FA6C44\",\"#F2E713\",\"#D1D811\",\"#0AB6F8\",\"#5688C1\"]\n", 103 | "chart_width = 850\n", 104 | "chart_height = 350" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "id": "7c14990c", 111 | "metadata": { 112 | "execution": { 113 | "iopub.execute_input": "2023-01-06T15:59:47.488513Z", 114 | "iopub.status.busy": "2023-01-06T15:59:47.488221Z", 115 | "iopub.status.idle": "2023-01-06T15:59:47.553951Z", 116 | "shell.execute_reply": "2023-01-06T15:59:47.553394Z" 117 | } 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "/opt/hostedtoolcache/Python/3.11.1/x64/lib/python3.11/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n", 125 | " for col_name, dtype in df.dtypes.iteritems():\n" 126 | ] 127 | }, 128 | { 129 | "data": { 130 | "text/html": [ 131 | "\n", 132 | "
\n", 133 | "" 186 | ], 187 | "text/plain": [ 188 | "alt.Chart(...)" 189 | ] 190 | }, 191 | "execution_count": 5, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "alt.Chart(df.groupby([\"risk_label\", \"split\"])[\"awardUSD\"].median().reset_index(),\n", 198 | " width=400,\n", 199 | " title=\"Findings' value distribution according to number of shared submissions by risk level\"\n", 200 | ").transform_filter(\n", 201 | " alt.datum.risk_label != \"0_Very low (unused since February 2022)\"\n", 202 | ").mark_bar().encode(\n", 203 | " x=alt.X(\"split:O\", title=\"Number of wardens sharing a finding\"),\n", 204 | " y=alt.Y(\"awardUSD:Q\", title=\"Finding $USD value\", axis=alt.Axis(format='$,.0f')),\n", 205 | " color=alt.Color(\n", 206 | " 'risk_label:N', \n", 207 | " title=\"Risk level\",\n", 208 | " scale=alt.Scale(domain=ordered_legend_reports_labels[:-1], range=label_colors[:-1]),\n", 209 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250),\n", 210 | " ),\n", 211 | " column=alt.Column(\"risk_label:N\", sort=ordered_legend_reports_labels[:-1], title=\"\"),\n", 212 | " tooltip=[\"risk_label:N\", \"split:O\", \"awardUSD:Q\"]\n", 213 | ").resolve_scale(\n", 214 | " y='independent'\n", 215 | ").resolve_axis(\n", 216 | " x='independent'\n", 217 | ")" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 6, 223 | "id": "55aacb8f", 224 | "metadata": { 225 | "execution": { 226 | "iopub.execute_input": "2023-01-06T15:59:47.557198Z", 227 | "iopub.status.busy": "2023-01-06T15:59:47.557014Z", 228 | "iopub.status.idle": "2023-01-06T15:59:47.596518Z", 229 | "shell.execute_reply": "2023-01-06T15:59:47.595958Z" 230 | } 231 | }, 232 | "outputs": [ 233 | { 234 | "name": "stderr", 235 | "output_type": "stream", 236 | "text": [ 237 | "/tmp/ipykernel_1890/3862203495.py:2: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n", 238 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"awardUSD\"].mean().reset_index().astype({\"end_time\": str}),\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "rewards = alt.Chart(\n", 244 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"awardUSD\"].mean().reset_index().astype({\"end_time\": str}), \n", 245 | " width=chart_width, \n", 246 | " height=chart_height,\n", 247 | " title=\"Value of a submission ($USD) over time by risk level\"\n", 248 | ").mark_line(\n", 249 | " point=True\n", 250 | ").encode(\n", 251 | " x=alt.X('end_time:T', title=\"\"),\n", 252 | " y=alt.Y('awardUSD:Q', title=\"\", axis=alt.Axis(format='$,.0f')),\n", 253 | " color=alt.Color(\n", 254 | " 'risk_label:N', \n", 255 | " title=\"Risk level\",\n", 256 | " scale=alt.Scale(domain=ordered_legend_reports_labels, range=label_colors),\n", 257 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250)\n", 258 | " ),\n", 259 | " tooltip=['end_time:T', 'risk_label:N', 'awardUSD:Q']\n", 260 | ")" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 7, 266 | "id": "99b7b5ee", 267 | "metadata": { 268 | "execution": { 269 | "iopub.execute_input": "2023-01-06T15:59:47.599031Z", 270 | "iopub.status.busy": "2023-01-06T15:59:47.598841Z", 271 | "iopub.status.idle": "2023-01-06T15:59:47.619268Z", 272 | "shell.execute_reply": "2023-01-06T15:59:47.618721Z" 273 | } 274 | }, 275 | "outputs": [ 276 | { 277 | "name": "stderr", 278 | "output_type": "stream", 279 | "text": [ 280 | "/tmp/ipykernel_1890/1810809613.py:2: UserWarning: 
Converting to PeriodArray/Index representation will drop timezone information.\n", 281 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"contest\"].count().reset_index().astype({\"end_time\": str}),\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "submissions = alt.Chart(\n", 287 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"contest\"].count().reset_index().astype({\"end_time\": str}),\n", 288 | " width=chart_width,\n", 289 | " height=chart_height,\n", 290 | " title=\"Expected warden reward ($USD) and number of reports over time by risk level\",\n", 291 | ").mark_line(\n", 292 | " opacity=.75,\n", 293 | " strokeDash=[2]\n", 294 | ").encode(\n", 295 | " x=alt.X('end_time:T', title=\"\"),\n", 296 | " y=alt.Y('contest:Q', title=\"Number of reports\"),\n", 297 | " color=alt.Color(\n", 298 | " 'risk_label:N', \n", 299 | " title=\"Risk level\",\n", 300 | " scale=alt.Scale(domain=ordered_legend_reports_labels, range=label_colors),\n", 301 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250),\n", 302 | " ),\n", 303 | ")" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "f63ff21f", 309 | "metadata": {}, 310 | "source": [ 311 | "## Expected warden reward calculation\n", 312 | "\n", 313 | "1. For each contest, take the total number of submissions for each risk level and divide it by the number of participants. This gives a value *X* that corresponds to **the average number of submission for each risk level by a single warden**.\n", 314 | "2. Now take the mean reward value for each contest and risk level and multiply that by *X*. This gives a value *Y* that corresponds to **the expected warden reward according the average number of submission**.\n", 315 | "3. Group the contests by their ending date (month/year) and take the mean of the *Y*s. This gives **the expected warden reward per month for each risk level** which is what is plotted in the second chart. 
" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 8, 321 | "id": "c6714ef5", 322 | "metadata": { 323 | "execution": { 324 | "iopub.execute_input": "2023-01-06T15:59:47.621610Z", 325 | "iopub.status.busy": "2023-01-06T15:59:47.621430Z", 326 | "iopub.status.idle": "2023-01-06T15:59:47.640546Z", 327 | "shell.execute_reply": "2023-01-06T15:59:47.640046Z" 328 | } 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "df2 = pd.merge(df.groupby([\"contest\", \"end_time\", \"risk\"])[[\"finding\"]].count().reset_index(), df.groupby(\"contest\")[\"handle\"].nunique(), on=\"contest\")" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 9, 338 | "id": "7bd3543c", 339 | "metadata": { 340 | "execution": { 341 | "iopub.execute_input": "2023-01-06T15:59:47.642868Z", 342 | "iopub.status.busy": "2023-01-06T15:59:47.642695Z", 343 | "iopub.status.idle": "2023-01-06T15:59:47.648473Z", 344 | "shell.execute_reply": "2023-01-06T15:59:47.647936Z" 345 | } 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "df2[\"average_findings\"] = df2.finding / df2.handle\n", 350 | "df2[\"end_time\"] = pd.to_datetime(df2[\"end_time\"])" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 10, 356 | "id": "db1134bd", 357 | "metadata": { 358 | "execution": { 359 | "iopub.execute_input": "2023-01-06T15:59:47.650524Z", 360 | "iopub.status.busy": "2023-01-06T15:59:47.650349Z", 361 | "iopub.status.idle": "2023-01-06T15:59:47.662726Z", 362 | "shell.execute_reply": "2023-01-06T15:59:47.662232Z" 363 | } 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "df3 = pd.merge(df, df2[[\"contest\", \"average_findings\", \"risk\"]], on=[\"contest\", \"risk\"])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 11, 373 | "id": "eb907555", 374 | "metadata": { 375 | "execution": { 376 | "iopub.execute_input": "2023-01-06T15:59:47.665095Z", 377 | "iopub.status.busy": "2023-01-06T15:59:47.664843Z", 378 | "iopub.status.idle": "2023-01-06T15:59:47.682620Z", 379 | "shell.execute_reply": "2023-01-06T15:59:47.681999Z" 380 | } 381 | }, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/html": [ 386 | "
\n", 387 | "\n", 400 | "\n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | "
end_timecontestaverage_findingsriskrisk_labelawardUSDaverage_usd_per_risk
02021-02-22 23:59:00+00:0010.12500033_High5702.88712.860000
12021-02-22 23:59:00+00:0012.12500011_Low (unused since February 2022)153.98327.207500
22021-02-22 23:59:00+00:0012.12500011_Low (unused since February 2022)256.63545.338750
32021-02-22 23:59:00+00:0012.12500011_Low (unused since February 2022)256.63545.338750
42021-02-22 23:59:00+00:0012.12500011_Low (unused since February 2022)570.291211.866250
........................
248492022-11-18 20:00:00+00:001820.195652gg_Gas optimization1074.64210.255652
248502022-11-18 20:00:00+00:001820.195652gg_Gas optimization826.64161.733913
248512022-11-18 20:00:00+00:001820.195652gg_Gas optimization68.1413.331739
248522022-11-18 20:00:00+00:001820.195652gg_Gas optimization68.1413.331739
248532022-11-18 20:00:00+00:001820.195652gg_Gas optimization68.1413.331739
\n", 526 | "

24854 rows × 7 columns

\n", 527 | "
" 528 | ], 529 | "text/plain": [ 530 | " end_time contest average_findings risk \\\n", 531 | "0 2021-02-22 23:59:00+00:00 1 0.125000 3 \n", 532 | "1 2021-02-22 23:59:00+00:00 1 2.125000 1 \n", 533 | "2 2021-02-22 23:59:00+00:00 1 2.125000 1 \n", 534 | "3 2021-02-22 23:59:00+00:00 1 2.125000 1 \n", 535 | "4 2021-02-22 23:59:00+00:00 1 2.125000 1 \n", 536 | "... ... ... ... ... \n", 537 | "24849 2022-11-18 20:00:00+00:00 182 0.195652 g \n", 538 | "24850 2022-11-18 20:00:00+00:00 182 0.195652 g \n", 539 | "24851 2022-11-18 20:00:00+00:00 182 0.195652 g \n", 540 | "24852 2022-11-18 20:00:00+00:00 182 0.195652 g \n", 541 | "24853 2022-11-18 20:00:00+00:00 182 0.195652 g \n", 542 | "\n", 543 | " risk_label awardUSD average_usd_per_risk \n", 544 | "0 3_High 5702.88 712.860000 \n", 545 | "1 1_Low (unused since February 2022) 153.98 327.207500 \n", 546 | "2 1_Low (unused since February 2022) 256.63 545.338750 \n", 547 | "3 1_Low (unused since February 2022) 256.63 545.338750 \n", 548 | "4 1_Low (unused since February 2022) 570.29 1211.866250 \n", 549 | "... ... ... ... \n", 550 | "24849 g_Gas optimization 1074.64 210.255652 \n", 551 | "24850 g_Gas optimization 826.64 161.733913 \n", 552 | "24851 g_Gas optimization 68.14 13.331739 \n", 553 | "24852 g_Gas optimization 68.14 13.331739 \n", 554 | "24853 g_Gas optimization 68.14 13.331739 \n", 555 | "\n", 556 | "[24854 rows x 7 columns]" 557 | ] 558 | }, 559 | "execution_count": 11, 560 | "metadata": {}, 561 | "output_type": "execute_result" 562 | } 563 | ], 564 | "source": [ 565 | "df3[\"average_usd_per_risk\"] = df3.average_findings * df3.awardUSD\n", 566 | "df3[[\"end_time\", \"contest\", \"average_findings\", \"risk\", \"risk_label\", \"awardUSD\", \"average_usd_per_risk\"]]" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 12, 572 | "id": "9032c059", 573 | "metadata": { 574 | "execution": { 575 | "iopub.execute_input": "2023-01-06T15:59:47.684785Z", 576 | "iopub.status.busy": "2023-01-06T15:59:47.684613Z", 577 | "iopub.status.idle": "2023-01-06T15:59:47.731480Z", 578 | "shell.execute_reply": "2023-01-06T15:59:47.730944Z" 579 | } 580 | }, 581 | "outputs": [ 582 | { 583 | "name": "stderr", 584 | "output_type": "stream", 585 | "text": [ 586 | "/tmp/ipykernel_1890/1760306320.py:1: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n", 587 | " weighted_rewards = alt.Chart(df3.groupby([df3.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"average_usd_per_risk\"].mean().reset_index().astype({\"end_time\": str}),\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "weighted_rewards = alt.Chart(df3.groupby([df3.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"average_usd_per_risk\"].mean().reset_index().astype({\"end_time\": str}),\n", 593 | " width=chart_width,\n", 594 | " height=chart_height\n", 595 | ").mark_line(point=True).encode(\n", 596 | " x='end_time:T',\n", 597 | " y=alt.Y('average_usd_per_risk:Q', title=\"\", axis=alt.Axis(format='$,.0f')),\n", 598 | " color=alt.Color(\n", 599 | " 'risk_label:N', \n", 600 | " title=\"Risk level\",\n", 601 | " scale=alt.Scale(domain=ordered_legend_reports_labels, range=label_colors),\n", 602 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250)\n", 603 | " ),\n", 604 | " tooltip=['end_time:T', 'risk_label:N', 'average_usd_per_risk:Q']\n", 605 | ")" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "id": "a42a166a", 611 | "metadata": {}, 612 | "source": [ 613 | "## Analysis\n", 614 | "\n", 615 | "While the hierarchy 
of submission values is well respected over time (as shown in the first graph), the expected value tells another story about which category is worth more in the eyes of wardens. With the number of reports increasing, the expected reward for each category tends to become more uniform and it's not clear whether *medium* or *high* findings are worth more than the other.\n", 616 | "\n", 617 | "Certainly, the tough competition of recent contests has made the value of *high* and *medium* findings diminish, which is not particularly a good sign since they're edging closer to the values of easier and potentially automated *gas optimization* and *QA reports*." 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 13, 623 | "id": "0cb36b64", 624 | "metadata": { 625 | "execution": { 626 | "iopub.execute_input": "2023-01-06T15:59:47.733925Z", 627 | "iopub.status.busy": "2023-01-06T15:59:47.733721Z", 628 | "iopub.status.idle": "2023-01-06T15:59:47.774446Z", 629 | "shell.execute_reply": "2023-01-06T15:59:47.773792Z" 630 | }, 631 | "scrolled": false 632 | }, 633 | "outputs": [ 634 | { 635 | "name": "stderr", 636 | "output_type": "stream", 637 | "text": [ 638 | "/opt/hostedtoolcache/Python/3.11.1/x64/lib/python3.11/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n", 639 | " for col_name, dtype in df.dtypes.iteritems():\n" 640 | ] 641 | }, 642 | { 643 | "data": { 644 | "text/html": [ 645 | "\n", 646 | "
\n", 647 | "" 700 | ], 701 | "text/plain": [ 702 | "alt.VConcatChart(...)" 703 | ] 704 | }, 705 | "execution_count": 13, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "rewards & (weighted_rewards + submissions).resolve_scale(y='independent')" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "id": "f656ebbe", 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [] 721 | } 722 | ], 723 | "metadata": { 724 | "kernelspec": { 725 | "display_name": "Python 3 (ipykernel)", 726 | "language": "python", 727 | "name": "python3" 728 | }, 729 | "language_info": { 730 | "codemirror_mode": { 731 | "name": "ipython", 732 | "version": 3 733 | }, 734 | "file_extension": ".py", 735 | "mimetype": "text/x-python", 736 | "name": "python", 737 | "nbconvert_exporter": "python", 738 | "pygments_lexer": "ipython3", 739 | "version": "3.11.1" 740 | } 741 | }, 742 | "nbformat": 4, 743 | "nbformat_minor": 5 744 | } 745 | -------------------------------------------------------------------------------- /charts_data/participants_growth.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5de0fa20", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2023-01-06T15:59:53.274664Z", 10 | "iopub.status.busy": "2023-01-06T15:59:53.274158Z", 11 | "iopub.status.idle": "2023-01-06T15:59:53.634015Z", 12 | "shell.execute_reply": "2023-01-06T15:59:53.633085Z" 13 | } 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd\n", 18 | "import altair as alt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "9d1203fb", 25 | "metadata": { 26 | "execution": { 27 | "iopub.execute_input": "2023-01-06T15:59:53.637381Z", 28 | "iopub.status.busy": "2023-01-06T15:59:53.636825Z", 29 | "iopub.status.idle": "2023-01-06T15:59:53.643236Z", 30 | "shell.execute_reply": "2023-01-06T15:59:53.642656Z" 31 | } 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "DataTransformerRegistry.enable('default')" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "alt.data_transformers.disable_max_rows()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "id": "72690c0e", 53 | "metadata": { 54 | "execution": { 55 | "iopub.execute_input": "2023-01-06T15:59:53.645968Z", 56 | "iopub.status.busy": "2023-01-06T15:59:53.645793Z", 57 | "iopub.status.idle": "2023-01-06T15:59:53.844257Z", 58 | "shell.execute_reply": "2023-01-06T15:59:53.843715Z" 59 | } 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "data = pd.read_csv(\"https://raw.githubusercontent.com/Krow10/code4rena-scraper/master/github_code4rena.csv\")\n", 64 | "data[\"date\"] = pd.to_datetime(data[\"date\"])\n", 65 | "plt_data = pd.DataFrame()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "aef95b25", 71 | "metadata": {}, 72 | "source": [ 73 | "### Active wardens\n", 74 | "Represents the number of wardens who participated in at least one contest during the month" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "15007a51", 81 | "metadata": { 82 | "execution": { 83 | "iopub.execute_input": "2023-01-06T15:59:53.848443Z", 84 | "iopub.status.busy": "2023-01-06T15:59:53.848045Z", 85 | "iopub.status.idle": "2023-01-06T15:59:53.857690Z", 86 | 
"shell.execute_reply": "2023-01-06T15:59:53.857197Z" 87 | } 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "plt_data[\"active_wardens\"] = data.groupby(\"date\")[\"handle\"].nunique()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "ce2a9dde", 97 | "metadata": {}, 98 | "source": [ 99 | "### Inactive wardens\n", 100 | "Represents wardens who have only been active for one month maximum " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "id": "30b03e60", 107 | "metadata": { 108 | "execution": { 109 | "iopub.execute_input": "2023-01-06T15:59:53.860098Z", 110 | "iopub.status.busy": "2023-01-06T15:59:53.859806Z", 111 | "iopub.status.idle": "2023-01-06T15:59:53.877265Z", 112 | "shell.execute_reply": "2023-01-06T15:59:53.876227Z" 113 | } 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "one_timers = data.groupby([\"handle\", \"date\"]).size().groupby(level=0).size().to_frame()[lambda x: x.iloc[:, [0]] <= 1].dropna().reset_index()[\"handle\"]\n", 118 | "plt_data[\"inactive_wardens\"] = data[data[\"handle\"].isin(one_timers)].groupby(\"date\")[\"handle\"].nunique().shift(1, fill_value=0)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "0b02cfea", 124 | "metadata": {}, 125 | "source": [ 126 | "### New wardens\n", 127 | "Represents wardens that made their first contest appearance during the month" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "id": "4034df57", 134 | "metadata": { 135 | "execution": { 136 | "iopub.execute_input": "2023-01-06T15:59:53.879706Z", 137 | "iopub.status.busy": "2023-01-06T15:59:53.879433Z", 138 | "iopub.status.idle": "2023-01-06T15:59:53.894352Z", 139 | "shell.execute_reply": "2023-01-06T15:59:53.893868Z" 140 | } 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "plt_data[\"new_wardens\"] = (data.groupby(\"date\")[\"handle\"].unique().map(lambda x: set(x)) - data.groupby(\"date\")[\"handle\"].unique().map(lambda x: set(x)).shift(1)).map(lambda x: len(x) if isinstance(x, set) else 0)\n", 145 | "plt_data.iat[0, 2] = plt_data.iat[0, 0] # Active wardens = New wardens for first contest" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "df97972e", 151 | "metadata": {}, 152 | "source": [ 153 | "### Non-participating wardens\n", 154 | "Wardens who have been active at different times and have not participated for this particular month (doesn't include inactive wardens)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "id": "ac36a961", 161 | "metadata": { 162 | "execution": { 163 | "iopub.execute_input": "2023-01-06T15:59:53.896645Z", 164 | "iopub.status.busy": "2023-01-06T15:59:53.896468Z", 165 | "iopub.status.idle": "2023-01-06T15:59:53.901824Z", 166 | "shell.execute_reply": "2023-01-06T15:59:53.901294Z" 167 | } 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "plt_data[\"total_inactive_wardens\"] = plt_data[\"inactive_wardens\"].cumsum()\n", 172 | "plt_data[\"total_wardens\"] = plt_data[\"new_wardens\"].cumsum()\n", 173 | "plt_data[\"non_participating_wardens\"] = plt_data[\"total_wardens\"] - plt_data[\"active_wardens\"] - plt_data[\"total_inactive_wardens\"]\n", 174 | "plt_data.reset_index(inplace=True)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "id": "5b9e1e92", 181 | "metadata": { 182 | "execution": { 183 | "iopub.execute_input": "2023-01-06T15:59:53.903954Z", 184 | "iopub.status.busy": "2023-01-06T15:59:53.903674Z", 185 | "iopub.status.idle": 
"2023-01-06T15:59:53.912447Z", 186 | "shell.execute_reply": "2023-01-06T15:59:53.911923Z" 187 | }, 188 | "scrolled": false 189 | }, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/html": [ 194 | "
\n", 195 | "\n", 208 | "\n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
dateactive_wardensinactive_wardensnew_wardenstotal_inactive_wardenstotal_wardensnon_participating_wardens
02021-04-01170170170
12021-05-0122383250
22021-06-0119578325
32021-07-012821410468
42021-08-0130615166115
52021-09-0138722238322
62021-10-01375172810035
72021-11-01804533215341
82021-12-016514294618271
92022-01-011197715325381
102022-02-01102153368286116
112022-03-01113115379339147
122022-04-01155117790416171
132022-05-0123217119107535196
142022-06-012384599152634244
152022-07-012291879170713314
162022-08-0130318143188856365
172022-09-0133334129222985430
182022-10-01384551632771148487
192022-11-01106103153801163677
\n", 424 | "
" 425 | ], 426 | "text/plain": [ 427 | " date active_wardens inactive_wardens new_wardens \\\n", 428 | "0 2021-04-01 17 0 17 \n", 429 | "1 2021-05-01 22 3 8 \n", 430 | "2 2021-06-01 19 5 7 \n", 431 | "3 2021-07-01 28 2 14 \n", 432 | "4 2021-08-01 30 6 15 \n", 433 | "5 2021-09-01 38 7 22 \n", 434 | "6 2021-10-01 37 5 17 \n", 435 | "7 2021-11-01 80 4 53 \n", 436 | "8 2021-12-01 65 14 29 \n", 437 | "9 2022-01-01 119 7 71 \n", 438 | "10 2022-02-01 102 15 33 \n", 439 | "11 2022-03-01 113 11 53 \n", 440 | "12 2022-04-01 155 11 77 \n", 441 | "13 2022-05-01 232 17 119 \n", 442 | "14 2022-06-01 238 45 99 \n", 443 | "15 2022-07-01 229 18 79 \n", 444 | "16 2022-08-01 303 18 143 \n", 445 | "17 2022-09-01 333 34 129 \n", 446 | "18 2022-10-01 384 55 163 \n", 447 | "19 2022-11-01 106 103 15 \n", 448 | "\n", 449 | " total_inactive_wardens total_wardens non_participating_wardens \n", 450 | "0 0 17 0 \n", 451 | "1 3 25 0 \n", 452 | "2 8 32 5 \n", 453 | "3 10 46 8 \n", 454 | "4 16 61 15 \n", 455 | "5 23 83 22 \n", 456 | "6 28 100 35 \n", 457 | "7 32 153 41 \n", 458 | "8 46 182 71 \n", 459 | "9 53 253 81 \n", 460 | "10 68 286 116 \n", 461 | "11 79 339 147 \n", 462 | "12 90 416 171 \n", 463 | "13 107 535 196 \n", 464 | "14 152 634 244 \n", 465 | "15 170 713 314 \n", 466 | "16 188 856 365 \n", 467 | "17 222 985 430 \n", 468 | "18 277 1148 487 \n", 469 | "19 380 1163 677 " 470 | ] 471 | }, 472 | "execution_count": 8, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | } 476 | ], 477 | "source": [ 478 | "plt_data" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 9, 484 | "id": "7896954e", 485 | "metadata": { 486 | "execution": { 487 | "iopub.execute_input": "2023-01-06T15:59:53.914717Z", 488 | "iopub.status.busy": "2023-01-06T15:59:53.914547Z", 489 | "iopub.status.idle": "2023-01-06T15:59:53.927443Z", 490 | "shell.execute_reply": "2023-01-06T15:59:53.926960Z" 491 | } 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "base = alt.Chart(plt_data).transform_fold(\n", 496 | " ['non_participating_wardens', 'total_inactive_wardens', 'active_wardens'],\n", 497 | " as_=['column', 'value']\n", 498 | ").encode(\n", 499 | " x=alt.X('date:T', axis=alt.Axis(format='%m/%y', title='', grid=False)),\n", 500 | " y=alt.Y('value:Q', axis=alt.Axis(title='Total warden count')),\n", 501 | ")" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 10, 507 | "id": "976b40f7", 508 | "metadata": { 509 | "execution": { 510 | "iopub.execute_input": "2023-01-06T15:59:53.929582Z", 511 | "iopub.status.busy": "2023-01-06T15:59:53.929407Z", 512 | "iopub.status.idle": "2023-01-06T15:59:53.935528Z", 513 | "shell.execute_reply": "2023-01-06T15:59:53.934971Z" 514 | } 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "bars = base.mark_bar(size=30).encode(\n", 519 | " color=alt.Color('column:N', title='Warden categories'),\n", 520 | ")" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 11, 526 | "id": "b9a3b724", 527 | "metadata": { 528 | "execution": { 529 | "iopub.execute_input": "2023-01-06T15:59:53.938114Z", 530 | "iopub.status.busy": "2023-01-06T15:59:53.937670Z", 531 | "iopub.status.idle": "2023-01-06T15:59:53.958590Z", 532 | "shell.execute_reply": "2023-01-06T15:59:53.957458Z" 533 | } 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "active_labels = base.mark_text(\n", 538 | " dy=10,\n", 539 | " color='white'\n", 540 | ").transform_calculate(\n", 541 | " percentActive=\"datum.active_wardens / datum.total_wardens\",\n", 542 | " 
percentNonParticipating=\"datum.non_participating_wardens / datum.total_wardens\",\n", 543 | " percentInactive=\"datum.total_inactive_wardens / datum.total_wardens\"\n", 544 | ").encode(\n", 545 | " y='total_wardens:Q',\n", 546 | " text=alt.Text('percentActive:Q', format='.0%')\n", 547 | ")" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 12, 553 | "id": "142f82fd", 554 | "metadata": { 555 | "execution": { 556 | "iopub.execute_input": "2023-01-06T15:59:53.960848Z", 557 | "iopub.status.busy": "2023-01-06T15:59:53.960580Z", 558 | "iopub.status.idle": "2023-01-06T15:59:53.992083Z", 559 | "shell.execute_reply": "2023-01-06T15:59:53.991043Z" 560 | } 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "non_p_labels = base.mark_text(\n", 565 | " dy=10,\n", 566 | " color='white'\n", 567 | ").transform_calculate(\n", 568 | " dy='datum.non_participating_wardens + datum.total_inactive_wardens',\n", 569 | " percentNonParticipating=\"datum.non_participating_wardens / datum.total_wardens\"\n", 570 | ").encode(\n", 571 | " y='dy:Q',\n", 572 | " text=alt.Text('percentNonParticipating:Q', format='.0%'),\n", 573 | " opacity=alt.condition('datum.non_participating_wardens > 10', alt.value(1), alt.value(0))\n", 574 | ")" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 13, 580 | "id": "c30dac5e", 581 | "metadata": { 582 | "execution": { 583 | "iopub.execute_input": "2023-01-06T15:59:53.994261Z", 584 | "iopub.status.busy": "2023-01-06T15:59:53.994089Z", 585 | "iopub.status.idle": "2023-01-06T15:59:54.020058Z", 586 | "shell.execute_reply": "2023-01-06T15:59:54.019251Z" 587 | } 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "inactive_labels = base.mark_text(\n", 592 | " dy=10,\n", 593 | " color='white'\n", 594 | ").transform_calculate(\n", 595 | " percentInactive=\"datum.total_inactive_wardens / datum.total_wardens\"\n", 596 | ").encode(\n", 597 | " y='total_inactive_wardens:Q',\n", 598 | " text=alt.Text('percentInactive:Q', format='.0%'),\n", 599 | " opacity=alt.condition('datum.total_inactive_wardens > 10', alt.value(1), alt.value(0))\n", 600 | ")" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 14, 606 | "id": "f7c43c0a", 607 | "metadata": { 608 | "execution": { 609 | "iopub.execute_input": "2023-01-06T15:59:54.022179Z", 610 | "iopub.status.busy": "2023-01-06T15:59:54.021907Z", 611 | "iopub.status.idle": "2023-01-06T15:59:54.071489Z", 612 | "shell.execute_reply": "2023-01-06T15:59:54.070961Z" 613 | } 614 | }, 615 | "outputs": [ 616 | { 617 | "name": "stderr", 618 | "output_type": "stream", 619 | "text": [ 620 | "/opt/hostedtoolcache/Python/3.11.1/x64/lib/python3.11/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n", 621 | " for col_name, dtype in df.dtypes.iteritems():\n" 622 | ] 623 | }, 624 | { 625 | "data": { 626 | "text/html": [ 627 | "\n", 628 | "
\n", 629 | "" 682 | ], 683 | "text/plain": [ 684 | "alt.LayerChart(...)" 685 | ] 686 | }, 687 | "execution_count": 14, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "(bars + active_labels + non_p_labels + inactive_labels).properties(width=700,height=400)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "id": "0a4ec6e1", 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [] 703 | } 704 | ], 705 | "metadata": { 706 | "kernelspec": { 707 | "display_name": "Python 3 (ipykernel)", 708 | "language": "python", 709 | "name": "python3" 710 | }, 711 | "language_info": { 712 | "codemirror_mode": { 713 | "name": "ipython", 714 | "version": 3 715 | }, 716 | "file_extension": ".py", 717 | "mimetype": "text/x-python", 718 | "name": "python", 719 | "nbconvert_exporter": "python", 720 | "pygments_lexer": "ipython3", 721 | "version": "3.11.1" 722 | } 723 | }, 724 | "nbformat": 4, 725 | "nbformat_minor": 5 726 | } 727 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from dotenv import load_dotenv 5 | from scrapers.WebScraper import * 6 | from scrapers.GithubScraper import * 7 | 8 | load_dotenv() 9 | 10 | def scrape(scrape_method, scrape_data_desc, url, csv_file=None): 11 | logging.info(f"Starting {scrape_data_desc} data scraping at '{url}'...") 12 | df = scrape_method(url) 13 | 14 | if (csv_file): 15 | df.to_csv(csv_file, index=False) 16 | 17 | logging.info(f"Finished {scrape_data_desc} data scraping: got {len(df.index)} rows of data [success]") 18 | return df 19 | 20 | if __name__ == "__main__": 21 | file_handler = logging.FileHandler("code4rena.log", mode='w', encoding='utf8') 22 | console_handler = logging.StreamHandler() 23 | console_handler.setLevel(logging.INFO) 24 | logging.basicConfig( 25 | handlers=[file_handler, console_handler], 26 | level=logging.DEBUG, 27 | format='%(module)s:T+%(relativeCreated)d\t%(levelname)s %(message)s' 28 | ) 29 | logging.getLogger('selenium').setLevel(logging.WARNING) # Prevent log file from being filed with Selenium debug output 30 | 31 | logging.addLevelName(logging.DEBUG, '[DEBUG]') 32 | logging.addLevelName(logging.INFO, '[*]') 33 | logging.addLevelName(logging.WARNING, '[!]') 34 | logging.addLevelName(logging.ERROR, '[ERROR]') 35 | logging.addLevelName(logging.CRITICAL, '[CRITICAL]') 36 | 37 | leaderboard_url = "https://code4rena.com/leaderboard" 38 | leaderboard_csv_file = 'leaderboard_code4rena.csv' 39 | 40 | contests_url = "https://code4rena.com/contests" 41 | contests_csv_file = 'contests_code4rena.csv' 42 | 43 | github_org = "code-423n4" 44 | github_csv_file = 'github_code4rena.csv' 45 | 46 | github_scraper = GithubScraper(console_handler) 47 | target = sys.argv[1].lower() # TODO : Parse command line arguments 48 | 49 | if (target == 'github'): 50 | scrape(github_scraper.scrape_repos, "Github repos", github_org, github_csv_file) 51 | else: 52 | web_scraper = WebScraper(console_handler) # Initialize Selenium driver only if needed 53 | if (target == 'leaderboard'): 54 | scrape(web_scraper.scrape_leaderboard_table, "Code4rena leaderboard", leaderboard_url, leaderboard_csv_file) 55 | elif (target == 'contests'): 56 | scrape(web_scraper.scrape_contests_data, "Code4rena contests", contests_url, contests_csv_file) 57 | else: 58 | scrape(web_scraper.scrape_leaderboard_table, "Code4rena leaderboard", 
leaderboard_url, leaderboard_csv_file) 59 | scrape(web_scraper.scrape_contests_data, "Code4rena contests", contests_url, contests_csv_file) 60 | scrape(github_scraper.scrape_repos, "Github repos", github_org, github_csv_file) 61 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altair>=4.2.0 2 | bs4>=0.0.1 3 | GitPython>=3.1.27 4 | jupyter==1.0.0 5 | lxml>=4.6.3 6 | nbconvert==7.2.2 7 | pandas>=1.4.3 8 | python-dotenv>=0.20.0 9 | requests>=2.26.0 10 | requests_cache>=0.9.5 11 | selenium>=4.3.0 12 | webdriver-manager>=3.8.3 13 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.8 -------------------------------------------------------------------------------- /scrapers/GithubScraper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pandas as pd 5 | import requests 6 | import requests_cache 7 | import time 8 | 9 | from datetime import datetime, date, timedelta 10 | from git import Repo 11 | 12 | class GithubScraper(): 13 | """docstring for GithubScraper""" 14 | def __init__(self, console_handler): 15 | super(GithubScraper, self).__init__() 16 | requests_cache.install_cache('code4rena_cache', expire_after=timedelta(days=1)) # Cache repo data for one day (prevent reaching API rate limit) 17 | self.console_handler = console_handler 18 | self.base = f"https://api.github.com/" 19 | self.headers = {'Authorization': 'token ' + os.getenv('API_ACCESS_TOKEN'), 'User-Agent': 'Bot'} # Using auth bumps the rate limit to 5_000 requests per HOUR 20 | 21 | def _check_request(self, req): 22 | if (req.status_code == 403 or req.status_code == 404): 23 | logging.critical(f"Request returned {req.status_code}: {req.json()}") 24 | exit(1) 25 | elif all(k in req.headers for k in ['x-ratelimit-limit', 'x-ratelimit-remaining']): 26 | logging.debug(f"Rate limit: {req.headers['x-ratelimit-remaining']} requests remaining (limit: {req.headers['x-ratelimit-limit']})") 27 | 28 | return req 29 | 30 | def _get_paginated(self, start_url, redirect=None): 31 | url = redirect if redirect != None else (self.base + start_url) 32 | return self._check_request(requests.get(url, headers=self.headers)) 33 | 34 | def get_repos(self, redirect=None): 35 | return self._get_paginated(f"orgs/{self.org}/repos?type=all&per_page=100", redirect) 36 | 37 | def get_issues(self, repo, redirect=None): 38 | return self._get_paginated(f"repos/{self.org}/{repo}/issues?state=all&per_page=100", redirect) 39 | 40 | def get_next_page_url(self, link_header): # Link header format: ; rel=[prev|next|last], ... 
41 | if (link_header == None): 42 | return None 43 | 44 | try: 45 | for (url, rel) in [x.split(';') for x in link_header.split(',')]: 46 | if (rel.strip().split('=')[1].strip('\"') == "next"): # Split 'rel=[prev|next|last]' 47 | return url.strip().replace('<', '').replace('>', '') 48 | except ValueError as e: 49 | pass 50 | 51 | return None 52 | 53 | def is_last_page(self, headers): 54 | return 'Link' not in headers or 'next' not in headers['Link'] 55 | 56 | def repo_creation_to_date(self, s): # format : [Y]-[M]-[D]T[H]:[M]:[S]Z 57 | return datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ').date() 58 | 59 | def scrape_repos(self, org): 60 | self.org = org 61 | logging.info(f"Fetching all public repos from '{self.org}'...") 62 | repos = [] 63 | req = requests.Response() 64 | req.headers = {'Link': 'next'} # Run loop at least once 65 | while not(self.is_last_page(req.headers)): 66 | next_page_url = self.get_next_page_url(req.headers['Link']) 67 | req = self.get_repos(next_page_url) 68 | repos += req.json() 69 | logging.debug(f"Got {len(repos)} repos, page {'1' if next_page_url == None else next_page_url[next_page_url.rindex('=')+1:]}") 70 | logging.info(f"Fetched {len(repos)} repos from '{self.org}' [success]") 71 | 72 | # Keep only audits reports starting from 20 March 2021 (earlier repos used a different format for tracking contributions) 73 | repos = list(filter(lambda repo: "findings" in repo['name'] and self.repo_creation_to_date(repo['created_at']) > date(2021, 3, 20), repos)) 74 | if (len(repos) == 0): 75 | logging.critical(f"No completed audits repos found, terminating...") 76 | exit(1) 77 | 78 | total_repos_size = sum([repo['size'] for repo in repos]) 79 | logging.info(f"Found {len(repos)} completed audits repos (total size: {total_repos_size} Kb)") 80 | 81 | repos_data_folder = "repos_data/" 82 | os.makedirs(repos_data_folder, exist_ok=True) # Create cloning directory if needed 83 | cloned_repos = 0 84 | logging.info(f"Cloning new repositories to '{repos_data_folder}'...") 85 | for repo in repos: 86 | if not(os.path.isdir(repos_data_folder + repo['name'])): 87 | logging.info(f"Cloning {repo['name']} ({repos.index(repo) + 1}/{len(repos)})...") 88 | Repo.clone_from(repo['clone_url'], repos_data_folder + repo['name']) 89 | cloned_repos += 1 90 | 91 | if (cloned_repos > 0): 92 | logging.info(f"Cloned {cloned_repos} new repos to '{repos_data_folder}' [success]") 93 | else: 94 | logging.warning(f"No new repos to clone") 95 | 96 | logging.info("Getting issues data for each repo (this may take some time)...") 97 | issues = {repo['name'] : [] for repo in repos} 98 | self.console_handler.terminator = "\r" 99 | for repo in repos: 100 | req = requests.Response() 101 | req.headers = {'Link': 'next'} # Run loop at least once 102 | count_repo_issues = 0 103 | while not(self.is_last_page(req.headers)): 104 | next_page_url = self.get_next_page_url(req.headers['Link']) 105 | req = self.get_issues(repo['name'], next_page_url) 106 | issues[repo['name']] += req.json() 107 | count_repo_issues += len(issues[repo['name']]) 108 | logging.debug(f"Got {count_repo_issues} issues for repo '{repo['name']}', page {'1' if next_page_url == None else next_page_url[next_page_url.rindex('=')+1:]}") 109 | logging.info(f"Processed {repos.index(repo) + 1} / {len(repos)} repos") 110 | self.console_handler.terminator = "\n" 111 | logging.info(f"Got {sum([len(k) for k in issues.values()])} total issues in {len(repos)} repos from {self.org} [success]") 112 | 113 | ''' 114 | At this point we have for each public contest report: 115 
| - Sponsor 116 | - Rough date for when it took place (month, year) 117 | - Participants 118 | - Handle 119 | - Address 120 | - Issues reported 121 | - Issues (= audit submission) tags 122 | - Risk (QA, Non-critical/0, Low/1, Med/2, High/3) 123 | - Sponsor acknowledged, confirmed, disputed, addressed/resolved 124 | - Duplicate 125 | - Is gas optimization 126 | - Is judged invalid 127 | - Has been upgraded by judge 128 | - Has been withdrawn by warden 129 | ... others 130 | ''' 131 | 132 | logging.info(f"Parsing cloned repos data (this may take some time)...") 133 | repos_columns = ['contest', 'contest_sponsor', 'date', 'handle', 'address', 'risk', 'title', 'issueId', 'issueUrl', 'tags'] 134 | repos_data = pd.DataFrame(columns=repos_columns) 135 | repo_names = os.listdir(repos_data_folder) 136 | 137 | self.console_handler.terminator = "\r" 138 | for repo in repo_names: 139 | repo_issues = issues[repo] 140 | for json_filename in os.listdir(repos_data_folder + repo + '/data/'): 141 | with open(repos_data_folder + repo + '/data/' + json_filename, 'r') as json_file: 142 | ''' 143 | Sample JSON data file: 144 | { 145 | "contest": "[ID]", 146 | "handle": "[HANDLE]", 147 | "address": "[ADDRESS]", 148 | "risk": "[1/2/3]", 149 | "title": "[TITLE]", 150 | "issueId": [ISSUE NUMBER], 151 | "issueUrl": "[UNUSED]" 152 | } 153 | ''' 154 | try: 155 | json_data = json.loads(json_file.read()) # Loads dict from json data file 156 | issue = next(i for i in repo_issues if i['number'] == json_data['issueId']) # Get issue details 157 | 158 | # Additional infos 159 | json_data['contest_sponsor'] = " ".join(repo.split('-')[2:-1]) 160 | json_data['date'] = "/".join(repo.split('-')[:2]) 161 | json_data['issueCreation'] = issue['created_at'] 162 | json_data['tags'] = ";".join([l['name'] for l in issue['labels']]) 163 | 164 | repos_data = pd.concat([repos_data, pd.DataFrame([json_data])], ignore_index=True) 165 | except Exception as e: 166 | logging.warning(f"Failed to parse '{json_filename}'' for repo '{repo}': {e}\n") 167 | logging.info(f"Processed {repo_names.index(repo) + 1} / {len(repo_names)} repos") 168 | self.console_handler.terminator = "\n" 169 | 170 | return repos_data.reset_index(drop=True) -------------------------------------------------------------------------------- /scrapers/WebScraper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pandas as pd 4 | import time 5 | 6 | from bs4 import BeautifulSoup 7 | from selenium import webdriver 8 | from webdriver_manager.firefox import GeckoDriverManager 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.firefox.options import Options 11 | from selenium.common.exceptions import NoSuchElementException 12 | 13 | class WebScraper(): 14 | """docstring for WebScraper""" 15 | def __init__(self, console_handler): 16 | super(WebScraper, self).__init__() 17 | self.console_handler = console_handler 18 | logging.info("Starting Selenium driver (firefox headless)...") 19 | browser_options = Options() 20 | browser_options.headless = True 21 | self.driver = webdriver.Firefox(options=browser_options, executable_path=GeckoDriverManager().install()) 22 | logging.info(f"Selenium driver started [success]") 23 | 24 | def __del__(self): 25 | self.driver.quit() 26 | 27 | def scrape_leaderboard_table(self, url): 28 | logging.info(f"Parsing dropdown options from '{url}'...") 29 | self.driver.get(url) 30 | time.sleep(1) # Wait for JS to load page 31 | 32 | periods = 
self.driver.find_element(By.XPATH, "//select[@class='dropdown']").find_elements(By.TAG_NAME, 'option') 33 | logging.info(f"Got {len(periods)} options from '{url}' [success]") 34 | 35 | leaderboard_columns = ['period', 'handle', 'prize_money', 'total_reports', 'high_all', 'high_solo', 'med_all', 'med_solo', 'gas_all'] 36 | leaderboard_data = pd.DataFrame(columns=leaderboard_columns) 37 | 38 | logging.info(f"Parsing leaderboard data for each option...") 39 | self.console_handler.terminator = "\r" 40 | for period in periods: 41 | period.click() 42 | time.sleep(10) # Wait for JS to load page 43 | 44 | table = BeautifulSoup(self.driver.find_element(By.XPATH, "//table[@class='leaderboard-table']").get_attribute("outerHTML"), 'lxml') 45 | for div in table.find_all(attrs={'class': 'sb-avatar__text'}): # Remove avatar text for correct parsing of wardens handle 46 | div.extract() 47 | 48 | df = pd.read_html(str(table))[0] 49 | df.columns = leaderboard_columns 50 | 51 | if df["handle"].str.contains('No results to show. Try changing filter criteria').any(): 52 | continue 53 | df["period"] = period.text 54 | df["prize_money"] = pd.to_numeric(df["prize_money"].str.replace(r'\$|,', '', regex=True), errors='coerce').fillna(0).astype(float) 55 | 56 | is_team_data = [] 57 | for div in table.find_all(attrs={'class': 'wrapper-competitor'}): 58 | is_team_data.append(div.find('div', attrs={'class': 'wrapper-members'}) != None) 59 | df["is_team"] = is_team_data if is_team_data else False 60 | 61 | leaderboard_data = pd.concat([leaderboard_data, df]) 62 | logging.info(f"Parsed {periods.index(period) + 1}/{len(periods)} options ({len(df.index)} rows added for '{period.text}')") 63 | self.console_handler.terminator = "\n" 64 | 65 | leaderboard_data.insert(2, 'is_team', leaderboard_data.pop('is_team')) # Re-order column next to 'handle' column 66 | return leaderboard_data.reset_index(drop=True) 67 | 68 | def scrape_contests_data(self, url): 69 | logging.info(f"Getting all contests links from '{url}'...") 70 | self.driver.get(url) 71 | time.sleep(1) # Wait for JS to load page 72 | 73 | contests = [] 74 | for c in self.driver.find_elements(By.XPATH, "//div[@class='wrapper-contest-content']"): 75 | try: 76 | contests.append(c.find_element(By.XPATH, "./a[contains(@class, 'contest-repo')]").get_attribute("href")) 77 | except NoSuchElementException as e: 78 | logging.warning(f"Could not find contest link for '{c.find_element(By.TAG_NAME, 'h4').get_attribute('innerText')}'\n") 79 | continue 80 | logging.info(f"Got {len(contests)} contests from '{url}' [success]") 81 | 82 | contest_table_columns = ['handle', 'prize_money', 'total_reports', 'high_all', 'high_solo', 'med_all', 'med_solo', 'gas_all'] 83 | contests_columns = ['contest_report_repo', 'contest_sponsor', 'contest_desc', 'start', 'end', 'prize_pool'] + contest_table_columns 84 | contests_data = pd.DataFrame(columns=contests_columns) 85 | 86 | logging.info(f"Scraping each contest entry data...") 87 | self.console_handler.terminator = "\r" 88 | for contest_link in contests: 89 | logging.debug(f"Getting data for '{contest_link}'...") 90 | self.driver.get(contest_link) 91 | time.sleep(1) # Wait for JS to load page 92 | 93 | try: 94 | contest_tabs = self.driver.find_element(By.XPATH, "//div[@class='contest-tabs']") 95 | except NoSuchElementException as e: 96 | logging.warning(f"Could not parse '{contest_link}': contests tabs not found\n") 97 | continue 98 | 99 | try: 100 | leaderboard_table = contest_tabs.find_element(By.XPATH, "//table[@class='leaderboard-table']") 101 
| except NoSuchElementException as e: 102 | logging.warning(f"No awards distributed yet for '{contest_link}'\n") 103 | continue 104 | 105 | if "No results to show" in leaderboard_table.get_attribute("outerHTML"): 106 | logging.warning(f"No awards distributed yet for '{contest_link}'\n") 107 | continue 108 | 109 | df = pd.read_html(leaderboard_table.get_attribute("outerHTML"))[0] 110 | df.columns = ["id"] + contest_table_columns 111 | df.drop("id", axis=1, inplace=True) 112 | 113 | df["contest_report_repo"] = '' 114 | repos_buttons = self.driver.find_element(By.XPATH, "//div[@class='button-wrapper']") 115 | if (len(repos_buttons.find_elements(By.TAG_NAME, 'a')) > 1): # Check that the contest report has been published 116 | ''' 117 | Report link is either a PDF file (older contests) or a link containing the repo name which can be used with the scraped Github data 118 | Examples : 119 | https://ipfs.io/ipfs/bafybeicjla2h26q3wz4s344bsrtvhkxr3ypm44owvrzyorb2t6tcptlmem/C4%20Slingshot%20report.pdf 120 | https://code4rena.com/reports/2021-06-pooltogether 121 | ''' 122 | report_link = repos_buttons.find_elements(By.TAG_NAME, 'a')[-1].get_attribute("href") 123 | df["contest_report_repo"] = '' if not 'code4rena.com/reports/' in report_link else report_link[report_link.rindex('/') + 1:] 124 | 125 | contest_header_div = self.driver.find_element(By.XPATH, "//div[@class='top-section-text']") 126 | df["contest_sponsor"] = contest_header_div.find_element(By.TAG_NAME, 'h1').get_attribute('innerText').lower().replace(' ', '').replace('contest', '') 127 | df["contest_desc"] = contest_header_div.find_element(By.TAG_NAME, 'p').get_attribute('innerText') 128 | 129 | contest_date_div = self.driver.find_element(By.XPATH, "//div[@class='contest-tippy-top']") 130 | dates = contest_date_div.find_element(By.TAG_NAME, 'p').get_attribute("innerText").split('—') # Example: Contest ran 16 February 2021—22 February 2021 131 | df["start"] = ' '.join(dates[0].split(' ')[-3:]) # Remove the 'Contest ran ' and keep only starting date 132 | df["end"] = dates[1] 133 | 134 | df["prize_money"] = df["prize_money"].str.replace(r'\$|,', '', regex=True).astype(float) 135 | df["prize_pool"] = round(sum(df["prize_money"])) 136 | 137 | contests_data = pd.concat([contests_data, df]) 138 | logging.info(f"Parsed {contests.index(contest_link) + 1}/{len(contests)} contests ({len(df.index)} rows added) ") 139 | self.console_handler.terminator = "\n" 140 | 141 | return contests_data.reset_index(drop=True) 142 | -------------------------------------------------------------------------------- /scrapers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0237h/code4rena-scraper/0ecc09da0d37b541ca49d96942574a8fbaf325eb/scrapers/__init__.py -------------------------------------------------------------------------------- /site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of / 6 | 7 | 8 | 9 | 10 |
[site/index.html — auto-generated apindex directory index; its HTML markup was stripped during extraction. Recoverable content: "Index of /" listing the entries '..' and 'static' (last modified 06-Jan-2023 16:00), footer "generated by apindex".]
-------------------------------------------------------------------------------- /site/static/index.html: --------------------------------------------------------------------------------
[site/static/index.html — auto-generated apindex directory index of the exported notebook pages; its HTML markup was stripped during extraction. Recoverable content: "Index of /static", all entries last modified 06-Jan-2023 16:00: contests_prize_pool_distribution.html (4093 kB), duplicates.html (8676 kB), findings_value.html (665 kB), participants_by_report_severity.html (819 kB), participants_growth.html (616 kB), participants_longevity.html (722 kB), severity_revenue.html (10373 kB), warden_stats.html (739 kB), wardens_team_vs_solo.html (2224 kB); footer "generated by apindex".]
--------------------------------------------------------------------------------