├── .github
└── workflows
│ ├── update_csv_files.yml
│ └── website.yml
├── .gitignore
├── README.md
├── charts_data
├── contests_prize_pool_distribution.ipynb
├── duplicates.ipynb
├── findings_value.ipynb
├── participants_by_report_severity.ipynb
├── participants_growth.ipynb
├── participants_longevity.ipynb
├── severity_revenue.ipynb
├── warden_stats.ipynb
└── wardens_team_vs_solo.ipynb
├── contests_code4rena.csv
├── github_code4rena.csv
├── leaderboard_code4rena.csv
├── main.py
├── requirements.txt
├── runtime.txt
├── scrapers
├── GithubScraper.py
├── WebScraper.py
└── __init__.py
└── site
├── index.html
└── static
├── contests_prize_pool_distribution.html
├── duplicates.html
├── findings_value.html
├── index.html
├── participants_by_report_severity.html
├── participants_growth.html
├── participants_longevity.html
├── severity_revenue.html
├── warden_stats.html
└── wardens_team_vs_solo.html
/.github/workflows/update_csv_files.yml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow to help you get started with Actions
2 |
3 | name: Update scraped data
4 |
5 | # Controls when the workflow will run
6 | on:
7 | schedule:
8 | - cron: "0 12 * * *"
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
13 | jobs:
14 | # This workflow contains a single job called "build"
15 | scrape:
16 | # The type of runner that the job will run on
17 | runs-on: ubuntu-latest
18 | env:
19 | API_ACCESS_TOKEN: ${{ secrets.API_ACCESS_TOKEN }}
20 | GH_TOKEN: ${{ secrets.API_ACCESS_TOKEN }}
21 |
22 | # Steps represent a sequence of tasks that will be executed as part of the job
23 | steps:
24 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
25 | - uses: actions/checkout@v3
26 |
27 | - name: Setup Python
28 | uses: actions/setup-python@v4.2.0
29 |
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip
33 | pip install -r requirements.txt
34 |
35 | - name: Run scraping script
36 | run: python main.py all
37 |
38 | - name: Git Auto Commit
39 | uses: stefanzweifel/git-auto-commit-action@v4.14.1
40 | with:
41 | commit_message: Updated all scraped data (CSV)
42 | file_pattern: '*.csv'
43 |
--------------------------------------------------------------------------------
/.github/workflows/website.yml:
--------------------------------------------------------------------------------
1 | name: Build website
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | paths:
8 | - '**.csv'
9 | workflow_dispatch:
10 |
11 | permissions:
12 | contents: write
13 | pages: write
14 | id-token: write
15 |
16 | # Allow one concurrent deployment
17 | concurrency:
18 | group: "pages"
19 | cancel-in-progress: true
20 |
21 | jobs:
22 | # Single deploy job since we're just deploying
23 | deploy:
24 | environment:
25 | name: github-pages
26 | url: ${{ steps.deployment.outputs.page_url }}
27 | runs-on: ubuntu-latest
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v3
31 | - name: Setup Pages
32 | uses: actions/configure-pages@v2
33 | - uses: actions/setup-python@v4
34 | with:
35 | python-version: "3.11"
36 | - run: |
37 | python -m pip install --upgrade pip
38 | pip install -r requirements.txt
39 | pip uninstall rfc3986-validator -y
40 | - name: Update analysis notebooks
41 | run: |
42 | for filename in charts_data/*.ipynb; do
43 | jupyter nbconvert --to notebook --execute $filename --ExecutePreprocessor.kernel_name='python3' --inplace
44 | done
45 | - name: Convert notebooks to HTML
46 | run: |
47 | mkdir -p site/static
48 | for filename in charts_data/*.ipynb; do
49 | jupyter nbconvert --to html $filename
50 | mv ${filename%.*}.html site/static/
51 | done
52 | - name: Install and build index
53 | run: |
54 | sudo apt-get update
55 | sudo apt-get install curl git -y
56 | curl https://raw.githubusercontent.com/jayanta525/apindex-v2/master/sudo-install.sh | bash
57 | cd site/
58 | apindex .
59 | - name: Upload artifacts
60 | uses: actions/upload-pages-artifact@v1
61 | with:
62 | path: 'site/'
63 | - name: Deploy to GitHub Pages
64 | id: deployment
65 | uses: actions/deploy-pages@v1
66 | - name: Commit
67 | uses: stefanzweifel/git-auto-commit-action@v4
68 | with:
69 | commit_message: Automated static html notebooks build
70 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | __pycache__/
3 | repos_data/
4 | *.ipynb_*
5 | *.log
6 | *.sqlite
7 | *.xlsx
8 | *.sublime*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | > [!IMPORTANT]
2 | > This repository is archived due to significant changes to both Code4rena's website and repos since this project first started over a year ago. Additionally, Code4rena started providing data through their [community resources](https://github.com/code-423n4/code423n4.com/blob/main/_data/README-community_resources.md).
3 | > A new repo is available as [code4rena-stats](https://github.com/Krow10/code4rena-stats) for the charts and insights.
4 |
5 | # Archived README.md
6 |
7 | ## code4rena-scraper
8 | Scraping [Code4rena](https://www.code4rena.com) contest audits reports for stats, fun (and profit ?).
9 |
10 | For accurate prize money numbers check the Code4rena [leaderboard](https://code4rena.com/leaderboard) directly.
11 |
12 | ### Why ?
13 |
14 | To play around with the [Github API](https://docs.github.com/en/rest) and work my python scripting skills. It also gave me the chance to work with data analysis tools such as [Jupyter notebooks](https://jupyter.org/), [Pandas](https://pandas.pydata.org/docs/index.html) for manipulating the data and [Altair](https://altair-viz.github.io/index.html), a visualization framework for generating charts.
15 |
16 | In the beginning, I was curious since I found out that the audits reports repos contains the address of each participant for sending their prize money (see [here](https://github.com/code-423n4/2021-05-nftx-findings/tree/main/data) for example, in the .json files). I thought it would be interesting to try and track the flow of funds (which could be an issue if certain people want to stay anonymous on this platform). However, this part is currently left out and the project quickly evolved into extracting data and building statistics from the Code4rena contests.
17 |
18 | Also, I realized after a week of working on this project that the [website repo](https://github.com/code-423n4/code423n4.com/tree/main/_data) of Code4rena already contains data for contests, findings and handles but hey, I learned a lot about the scraping process !
19 |
20 | ### What ?
21 |
22 | Data is scraped from the [Code4rena](https://www.code4rena.com) published audits repos using the [Github API](https://docs.github.com/en/rest), as well as directly from the [leaderboard](https://code4rena.com/leaderboard) and [contests](https://code4rena.com/contests/) entries of the Code4rena website and is parsed to CSV files. Original CSV files can also be used directly from the [Code4rena repo](https://github.com/code-423n4/code423n4.com/tree/main/_data) in the contests/ and findings/ folders.
23 |
24 | Part of the data extracted can be used to link ETH/Polygon addresses to contest participants. Using tools like [polygonscan](https://polygonscan.com), [etherscan](https://etherscan.io) or [Bitquery](https://explorer.bitquery.io/) allows to look at the flow of funds from and to those wallets (***this part hasn't been implemented or explored too much yet***).
25 |
26 | Is it useful ? Probably not.
27 |
28 | Worth the time ? I'd say yes as it gave me insights as to how to track funds across different chains (Polygon, Ethereum mainnet, etc.).
29 |
30 | Also, the extracted data allows to see who might be most efficient, writes the most duplicates, percentage of invalid submission, etc.
31 |
32 | #### Jupyter notebooks
33 | Notebooks can be found in the [charts_data](charts_data/) folder to visualize the data. A link is provided below each chart for a static view of each notebook.
34 | For an interactive lab, you could set up your own locally or run one online via [Binder](https://mybinder.org/v2/gh/Krow10/code4rena-scraper/HEAD).
35 |
36 | You can also run non-interactive notebooks through [nbviewer](https://nbviewer.org/github/Krow10/code4rena-scraper/tree/master/charts_data/) or view the static generated html at [https://krow10.github.io/code4rena-scraper/](https://krow10.github.io/code4rena-scraper/).
37 |
38 | ### How ?
39 |
40 | Install all requirements through `pip install -r requirements.txt` and setup your own [Github access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) in the `.env` file.
41 |
42 | Then use [`main.py [leaderboard|contests|github|all]`](main.py) to fetch and parse the latest data in CSV files. A Github action is available for updating the CSV files in this repo directly.
43 |
44 | Currently, the extracted data from the Github API ([github_code4rena.csv](github_code4rena.csv)) looks like this:
45 | | contest_id | handle | address | risk | title | issueId | issueUrl | contest_sponsor | date | tags | issueCreation |
46 | | ---------- | ------ | ------- | ---- | ----- | ------- | -------- | --------------- | ---- | ---- | ------------- |
47 | | Identifies the contest | Name of the warden | Polygon address | Characterizes the submission criticality (0 to 3, G for gas optimization, Q for QA) | Title of the submission | Github issue number | Github issue URL (unused) | Contest sponsor extracted from repo's name | Contest running date extracted from repo's name | Tags associated with issue (further characterize the submission) | Creation time of the issue |
48 |
49 | So each line in the csv file corresponds to one submission (identified by the `issueId`) of a warden (identified by his/her `(handle, address)` pair) for a given contest (identified by the `contest_id`).
50 |
51 | The data can then be imported inside a Jupyter notebook (or anywhere else, how you want to parse it) for easy processing and visualization like so:
52 | ```python
53 | import pandas as pd
54 | import altair as alt
55 |
56 | alt.data_transformers.disable_max_rows() # Disable 5_000 rows limit
57 | data = pd.read_csv("../github_code4rena.csv") # Set path accordingly
58 |
59 | # Visualize whatever (see https://altair-viz.github.io)
60 | alt.Chart(...)
61 | ```
62 |
63 | For the leaderboard ([leaderboard_code4rena.csv](leaderboard_code4rena.csv)), the data looks like this:
64 | | period | handle | is_team | prize_money | total_reports | high_all | high_solo | med_all | med_solo | gas_all
65 | | ------ | ------ | ------- | ----------- | ------------- | -------- | --------- | ------- | -------- | -------
66 | | The period for which the data comes from | Name of the warden | Boolean indicating if the handle refers to a team or not | Total earnings for the period (in $USD) | Total accepted reports for the period | High severity issues found with others | High severity issues found alone | Medium severity issues found with others | Medium severity issues found alone | Gas optimization reports submitted
67 |
68 | And for the contests ([contests_code4rena.csv](contests_code4rena.csv)), the data looks like this:
69 | | contest_report_repo | contest_sponsor | contest_desc | start | end | prize_pool | handle | prize_money | total_reports | high_all | high_solo | med_all | med_solo | gas_all
70 | | - | - | - | - | - | - | - | - | - | - | - | - | - | -
71 | | The name of the Github repo for the contest audit report or empty if not published yet | Name of the contest sponsor (lowercase, stripped) | Description of the contest sponsor | Starting date of the contest | Ending date of the contest | Total prize pool (calculated from the sum of warden's prize money) | Name of the warden | Total earnings for the contest (in $USD) | Total accepted reports for the contest | High severity issues found with others | High severity issues found alone | Medium severity issues found with others | Medium severity issues found alone | Gas optimization reports submitted
72 |
73 | ### Next ?
74 |
75 | - [x] Get linked audits issues tags and add the data to the csv (helps flag invalid, duplicate and accepted submissions)
76 | - [x] Use data analysis modules or external programs to actually do something with the data
77 | - [x] For each contest, scrape the prize pool and results from the Code4rena contest page ([example](https://code4rena.com/contests/2021-02-slingshot-finance-contest)) and make a [ridgeline plot](https://altair-viz.github.io/gallery/ridgeline_plot.html) showing the distribution of rewards for each prize pool amount (with layered distribution for same pool amount) or simpler [boxplots](https://altair-viz.github.io/gallery/boxplot.html)
78 | - [x] Rework Github scraping for returning DataFrame for consistency
79 | - [x] ~~Try to make [ridgeline](https://altair-viz.github.io/gallery/ridgeline_plot.html) work (it looks so sick!)~~ *not best for this kind of data actually*
80 | - [x] ~~Rework scraping of issue labels to identify first labels (meaning original submission severity level) and last labels or maybe track entire history of labels in chronological order~~ *done through parsing with pandas*
81 | - [x] ~~Valid / invalid reports charts by contest sorted by start date (bars again ?)~~ *done and more in warden_stats*
82 | - [ ] Connect to Polygon/Ethereum blockchain to show the balances of the addresses listed
83 | - [ ] Add command line argument parsing
84 | - [ ] Make CSV files auto-update through workflow when changes happens on the Code4rena repo
85 | - [x] ~~Some more data mining from on-chain data maybe (GraphQL API would be best)~~ *won't do, no time*
86 |
--------------------------------------------------------------------------------
/charts_data/findings_value.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "b0ba6704",
7 | "metadata": {
8 | "execution": {
9 | "iopub.execute_input": "2023-01-06T15:59:46.927300Z",
10 | "iopub.status.busy": "2023-01-06T15:59:46.926643Z",
11 | "iopub.status.idle": "2023-01-06T15:59:47.306523Z",
12 | "shell.execute_reply": "2023-01-06T15:59:47.305684Z"
13 | }
14 | },
15 | "outputs": [
16 | {
17 | "data": {
18 | "text/plain": [
19 | "DataTransformerRegistry.enable('default')"
20 | ]
21 | },
22 | "execution_count": 1,
23 | "metadata": {},
24 | "output_type": "execute_result"
25 | }
26 | ],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import altair as alt\n",
30 | "alt.data_transformers.disable_max_rows() # Disable 5_000 rows limit"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "id": "93c05a5d",
37 | "metadata": {
38 | "execution": {
39 | "iopub.execute_input": "2023-01-06T15:59:47.309189Z",
40 | "iopub.status.busy": "2023-01-06T15:59:47.308760Z",
41 | "iopub.status.idle": "2023-01-06T15:59:47.436029Z",
42 | "shell.execute_reply": "2023-01-06T15:59:47.435491Z"
43 | }
44 | },
45 | "outputs": [],
46 | "source": [
47 | "findings_data = pd.read_csv(\"https://raw.githubusercontent.com/code-423n4/code423n4.com/main/_data/findings/findings.csv\") # Set path accordingly\n",
48 | "findings_data[\"contestid\"] = findings_data[\"contest\"]\n",
49 | "contests_data = pd.read_csv(\"https://raw.githubusercontent.com/code-423n4/code423n4.com/main/_data/contests/contests.csv\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "id": "11933531",
56 | "metadata": {
57 | "execution": {
58 | "iopub.execute_input": "2023-01-06T15:59:47.439249Z",
59 | "iopub.status.busy": "2023-01-06T15:59:47.438932Z",
60 | "iopub.status.idle": "2023-01-06T15:59:47.475227Z",
61 | "shell.execute_reply": "2023-01-06T15:59:47.474686Z"
62 | }
63 | },
64 | "outputs": [],
65 | "source": [
66 | "df = pd.merge(findings_data, contests_data[[\"contestid\", \"end_time\"]], on=\"contestid\").drop_duplicates()\n",
67 | "df[\"end_time\"] = pd.to_datetime(df[\"end_time\"])\n",
68 | "df[\"risk_label\"] = df[\"risk\"].map(\n",
69 | " {\n",
70 | " '0': '0_Very low (unused since February 2022)', \n",
71 | " '1': '1_Low (unused since February 2022)',\n",
72 | " '2': '2_Medium',\n",
73 | " '3': '3_High',\n",
74 | " 'g': 'g_Gas optimization',\n",
75 | " 'q': 'q_QA report',\n",
76 | " }\n",
77 | ")"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 4,
83 | "id": "150ea944",
84 | "metadata": {
85 | "execution": {
86 | "iopub.execute_input": "2023-01-06T15:59:47.482025Z",
87 | "iopub.status.busy": "2023-01-06T15:59:47.481823Z",
88 | "iopub.status.idle": "2023-01-06T15:59:47.486206Z",
89 | "shell.execute_reply": "2023-01-06T15:59:47.485427Z"
90 | }
91 | },
92 | "outputs": [],
93 | "source": [
94 | "ordered_legend_reports_labels = [\n",
95 | " '3_High',\n",
96 | " '2_Medium',\n",
97 | " 'g_Gas optimization',\n",
98 | " 'q_QA report',\n",
99 | " '1_Low (unused since February 2022)',\n",
100 | " '0_Very low (unused since February 2022)', \n",
101 | "]\n",
102 | "label_colors = [\"#FE266D\",\"#FA6C44\",\"#F2E713\",\"#D1D811\",\"#0AB6F8\",\"#5688C1\"]\n",
103 | "chart_width = 850\n",
104 | "chart_height = 350"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 5,
110 | "id": "7c14990c",
111 | "metadata": {
112 | "execution": {
113 | "iopub.execute_input": "2023-01-06T15:59:47.488513Z",
114 | "iopub.status.busy": "2023-01-06T15:59:47.488221Z",
115 | "iopub.status.idle": "2023-01-06T15:59:47.553951Z",
116 | "shell.execute_reply": "2023-01-06T15:59:47.553394Z"
117 | }
118 | },
119 | "outputs": [
120 | {
121 | "name": "stderr",
122 | "output_type": "stream",
123 | "text": [
124 | "/opt/hostedtoolcache/Python/3.11.1/x64/lib/python3.11/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n",
125 | " for col_name, dtype in df.dtypes.iteritems():\n"
126 | ]
127 | },
128 | {
129 | "data": {
130 | "text/html": [
131 | "\n",
132 | "
\n",
133 | ""
186 | ],
187 | "text/plain": [
188 | "alt.Chart(...)"
189 | ]
190 | },
191 | "execution_count": 5,
192 | "metadata": {},
193 | "output_type": "execute_result"
194 | }
195 | ],
196 | "source": [
197 | "alt.Chart(df.groupby([\"risk_label\", \"split\"])[\"awardUSD\"].median().reset_index(),\n",
198 | " width=400,\n",
199 | " title=\"Findings' value distribution according to number of shared submissions by risk level\"\n",
200 | ").transform_filter(\n",
201 | " alt.datum.risk_label != \"0_Very low (unused since February 2022)\"\n",
202 | ").mark_bar().encode(\n",
203 | " x=alt.X(\"split:O\", title=\"Number of wardens sharing a finding\"),\n",
204 | " y=alt.Y(\"awardUSD:Q\", title=\"Finding $USD value\", axis=alt.Axis(format='$,.0f')),\n",
205 | " color=alt.Color(\n",
206 | " 'risk_label:N', \n",
207 | " title=\"Risk level\",\n",
208 | " scale=alt.Scale(domain=ordered_legend_reports_labels[:-1], range=label_colors[:-1]),\n",
209 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250),\n",
210 | " ),\n",
211 | " column=alt.Column(\"risk_label:N\", sort=ordered_legend_reports_labels[:-1], title=\"\"),\n",
212 | " tooltip=[\"risk_label:N\", \"split:O\", \"awardUSD:Q\"]\n",
213 | ").resolve_scale(\n",
214 | " y='independent'\n",
215 | ").resolve_axis(\n",
216 | " x='independent'\n",
217 | ")"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 6,
223 | "id": "55aacb8f",
224 | "metadata": {
225 | "execution": {
226 | "iopub.execute_input": "2023-01-06T15:59:47.557198Z",
227 | "iopub.status.busy": "2023-01-06T15:59:47.557014Z",
228 | "iopub.status.idle": "2023-01-06T15:59:47.596518Z",
229 | "shell.execute_reply": "2023-01-06T15:59:47.595958Z"
230 | }
231 | },
232 | "outputs": [
233 | {
234 | "name": "stderr",
235 | "output_type": "stream",
236 | "text": [
237 | "/tmp/ipykernel_1890/3862203495.py:2: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
238 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"awardUSD\"].mean().reset_index().astype({\"end_time\": str}),\n"
239 | ]
240 | }
241 | ],
242 | "source": [
243 | "rewards = alt.Chart(\n",
244 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"awardUSD\"].mean().reset_index().astype({\"end_time\": str}), \n",
245 | " width=chart_width, \n",
246 | " height=chart_height,\n",
247 | " title=\"Value of a submission ($USD) over time by risk level\"\n",
248 | ").mark_line(\n",
249 | " point=True\n",
250 | ").encode(\n",
251 | " x=alt.X('end_time:T', title=\"\"),\n",
252 | " y=alt.Y('awardUSD:Q', title=\"\", axis=alt.Axis(format='$,.0f')),\n",
253 | " color=alt.Color(\n",
254 | " 'risk_label:N', \n",
255 | " title=\"Risk level\",\n",
256 | " scale=alt.Scale(domain=ordered_legend_reports_labels, range=label_colors),\n",
257 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250)\n",
258 | " ),\n",
259 | " tooltip=['end_time:T', 'risk_label:N', 'awardUSD:Q']\n",
260 | ")"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 7,
266 | "id": "99b7b5ee",
267 | "metadata": {
268 | "execution": {
269 | "iopub.execute_input": "2023-01-06T15:59:47.599031Z",
270 | "iopub.status.busy": "2023-01-06T15:59:47.598841Z",
271 | "iopub.status.idle": "2023-01-06T15:59:47.619268Z",
272 | "shell.execute_reply": "2023-01-06T15:59:47.618721Z"
273 | }
274 | },
275 | "outputs": [
276 | {
277 | "name": "stderr",
278 | "output_type": "stream",
279 | "text": [
280 | "/tmp/ipykernel_1890/1810809613.py:2: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
281 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"contest\"].count().reset_index().astype({\"end_time\": str}),\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "submissions = alt.Chart(\n",
287 | " df.groupby([df.end_time.dt.to_period(\"M\"), \"risk_label\"])[\"contest\"].count().reset_index().astype({\"end_time\": str}),\n",
288 | " width=chart_width,\n",
289 | " height=chart_height,\n",
290 | " title=\"Expected warden reward ($USD) and number of reports over time by risk level\",\n",
291 | ").mark_line(\n",
292 | " opacity=.75,\n",
293 | " strokeDash=[2]\n",
294 | ").encode(\n",
295 | " x=alt.X('end_time:T', title=\"\"),\n",
296 | " y=alt.Y('contest:Q', title=\"Number of reports\"),\n",
297 | " color=alt.Color(\n",
298 | " 'risk_label:N', \n",
299 | " title=\"Risk level\",\n",
300 | " scale=alt.Scale(domain=ordered_legend_reports_labels, range=label_colors),\n",
301 | " legend=alt.Legend(orient=\"top\", labelFontSize=12, labelLimit=250),\n",
302 | " ),\n",
303 | ")"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "f63ff21f",
309 | "metadata": {},
310 | "source": [
311 | "## Expected warden reward calculation\n",
312 | "\n",
313 | "1. For each contest, take the total number of submissions for each risk level and divide it by the number of participants. This gives a value *X* that corresponds to **the average number of submission for each risk level by a single warden**.\n",
314 |     "2. Now take the mean reward value for each contest and risk level and multiply that by *X*. This gives a value *Y* that corresponds to **the expected warden reward according to the average number of submissions**.\n",
315 | "3. Group the contests by their ending date (month/year) and take the mean of the *Y*s. This gives **the expected warden reward per month for each risk level** which is what is plotted in the second chart. "
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 8,
321 | "id": "c6714ef5",
322 | "metadata": {
323 | "execution": {
324 | "iopub.execute_input": "2023-01-06T15:59:47.621610Z",
325 | "iopub.status.busy": "2023-01-06T15:59:47.621430Z",
326 | "iopub.status.idle": "2023-01-06T15:59:47.640546Z",
327 | "shell.execute_reply": "2023-01-06T15:59:47.640046Z"
328 | }
329 | },
330 | "outputs": [],
331 | "source": [
332 | "df2 = pd.merge(df.groupby([\"contest\", \"end_time\", \"risk\"])[[\"finding\"]].count().reset_index(), df.groupby(\"contest\")[\"handle\"].nunique(), on=\"contest\")"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 9,
338 | "id": "7bd3543c",
339 | "metadata": {
340 | "execution": {
341 | "iopub.execute_input": "2023-01-06T15:59:47.642868Z",
342 | "iopub.status.busy": "2023-01-06T15:59:47.642695Z",
343 | "iopub.status.idle": "2023-01-06T15:59:47.648473Z",
344 | "shell.execute_reply": "2023-01-06T15:59:47.647936Z"
345 | }
346 | },
347 | "outputs": [],
348 | "source": [
349 | "df2[\"average_findings\"] = df2.finding / df2.handle\n",
350 | "df2[\"end_time\"] = pd.to_datetime(df2[\"end_time\"])"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 10,
356 | "id": "db1134bd",
357 | "metadata": {
358 | "execution": {
359 | "iopub.execute_input": "2023-01-06T15:59:47.650524Z",
360 | "iopub.status.busy": "2023-01-06T15:59:47.650349Z",
361 | "iopub.status.idle": "2023-01-06T15:59:47.662726Z",
362 | "shell.execute_reply": "2023-01-06T15:59:47.662232Z"
363 | }
364 | },
365 | "outputs": [],
366 | "source": [
367 | "df3 = pd.merge(df, df2[[\"contest\", \"average_findings\", \"risk\"]], on=[\"contest\", \"risk\"])"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 11,
373 | "id": "eb907555",
374 | "metadata": {
375 | "execution": {
376 | "iopub.execute_input": "2023-01-06T15:59:47.665095Z",
377 | "iopub.status.busy": "2023-01-06T15:59:47.664843Z",
378 | "iopub.status.idle": "2023-01-06T15:59:47.682620Z",
379 | "shell.execute_reply": "2023-01-06T15:59:47.681999Z"
380 | }
381 | },
382 | "outputs": [
383 | {
384 | "data": {
385 | "text/html": [
386 | "