├── README.md
├── requirements.txt
├── fuzzer
    ├── syzkaller
    │   ├── db_to_list.sh
    │   ├── process_osv.sh
    │   ├── README.md
    │   ├── list_to_unique.py
    │   ├── summary_to_processed.py
    │   ├── processed_to_osv.py
    │   ├── db_to_list.sql
    │   ├── unique_to_delta.py
    │   └── delta_to_summary.py
    └── README.md
├── .gitignore
└── .github
    └── workflows
        ├── main.yml
        └── syzkaller.yml


/README.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | google-auth
2 | requests
3 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/db_to_list.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | sqlite3 -json mirror.sl3 ".read db_to_list.sql" | jq -c '. | map(. | to_entries | map({"key": .key, "value": (.value//""|split(","))}) | from_entries) | .[]'
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | mirror.sl3
 2 | fuzzer/syzkaller/wget-log*
 3 | fuzzer/syzkaller/unique.json
 4 | fuzzer/syzkaller/delta.json
 5 | fuzzer/syzkaller/processed.json
 6 | fuzzer/syzkaller/list.json
 7 | fuzzer/syzkaller/syzkaller/*
 8 | fuzzer/syzkaller/second.json
 9 | fuzzer/syzkaller/base.json
10 | fuzzer/syzkaller/syzkaller.tar.gz
11 | fuzzer/syzkaller/syzkaller_extid.txt
12 | fuzzer/syzkaller/syzkaller_fixed.txt
13 | fuzzer/syzkaller/reported_by.csv
14 | fuzzer/syzkaller/syzkaller.csv
15 | fuzzer/syzkaller/summary.json
16 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/process_osv.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -ex
 3 | set -o pipefail
 4 | 
 5 | function process_osv() {
 6 |     ./db_to_list.sh > list.json
 7 |     python3 ./list_to_unique.py list.json > unique.json
 8 |     python3 ./unique_to_delta.py unique.json > delta.json
 9 |     python3 ./delta_to_summary.py delta.json > summary.json
10 |     python3 ./summary_to_processed.py $1 summary.json > $2
11 |     python3 ./processed_to_osv.py $2
12 | }
13 | 
14 | wget -q -N https://linux-mirror-db.storage.googleapis.com/mirror.sl3
15 | wget -q -N https://linux-mirror-db.storage.googleapis.com/syzkaller.tar.gz
16 | tar xzf syzkaller.tar.gz syzkaller
17 | 
18 | process_osv $1 $2 | jq -c .[]
19 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/README.md:
--------------------------------------------------------------------------------
 1 | This directory has an OSV-generation prototype for syzkaller bugs.
 2 | 
 3 | Once the bug lands on OSV, and there are no updates for some time (backports, regressions), a CVE would follow.
 4 | 
 5 | The workflow is as follows:
 6 |   1. db_to_list.sh: just executes db_to_list.sql over mirror.sl3
 7 |   2. db_to_list.sql: obtains all syzkaller reports and their corresponding commits and cves
 8 |   3. list_to_unique.py: uses the list of syzkaller reports and commits and deduplicates them by fix, cve and syzkaller report
 9 |   4. unique_to_delta.py: gets the unique crashes and obtains all metadata needed for the advisory
10 |   5. delta_to_processed.py: merges the advisories and obtains the new ones
11 |   6. processed_to_osv.py: formats the processed advisories into osv format
12 | 
13 | Could be improved by:
14 |   - the advisory summary/title could include more information (eg, from the syzkaller reproducer we can guess attack vector, privileges required)
15 |   - llm summarization could be a lot faster if done in batch, instead of serially
16 |   - summarization could be done at the last stage, only on new ones
17 |   - individual steps could be just functions to improve readability
18 | 
19 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # This is a basic workflow to help you get started with Actions
 2 | 
 3 | name: Merge upstream branches
 4 | on:
 5 |   workflow_dispatch:
 6 | #  schedule:
 7 |      # actually, ~5 minutes is the highest
 8 |      # effective frequency you will get
 9 | #    - cron:  '* * * * *'
10 | 
11 | jobs:
12 |   merge:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - uses: actions/checkout@v2
16 |       - name: Merge upstream
17 |         run: |
18 |           git config --global user.name 'Pauldevt'
19 |           git config --global user.email 'pauldevt@users.noreply.github.com'
20 |           # "git checkout master" is unnecessary, already here by default
21 |           git pull --unshallow  # this option is very important, you would get
22 |                                 # complains about unrelated histories without it.
23 |                                 # (but actions/checkout@v2 can also be instructed
24 |                                 # to fetch all git depth right from the start)
25 |           git remote add upstream https://github.com/CVEProject/cvelist.git
26 |           git fetch upstream
27 |           
28 |           # Neither forget the -b opt,
29 |           # the feature/x ref is ambiguous at this stage
30 |           
31 |           git checkout -b master
32 |           git merge --no-edit upstream/master
33 |           git push origin master
34 | 


--------------------------------------------------------------------------------
/.github/workflows/syzkaller.yml:
--------------------------------------------------------------------------------
 1 | name: Syzkaller Integration
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | jobs:
 7 |   syzk:
 8 |     runs-on: ubuntu-latest
 9 |     permissions:
10 |       contents: 'write'
11 |       id-token: 'write'
12 |     steps:
13 |       - uses: actions/checkout@v2
14 |         with:
15 |           fetch-depth: 1
16 |       - uses: 'google-github-actions/auth@v0.4.0'
17 |         with:
18 |           workload_identity_provider: 'projects/799795028847/locations/global/workloadIdentityPools/github-pool/providers/github-provider-new'
19 |           service_account: 'github@sdcpocs.iam.gserviceaccount.com'
20 |       - uses: actions/setup-python@v4
21 |         with:
22 |           python-version: '3.9'
23 |           cache: 'pip'
24 |       - run: pip install google-auth requests
25 |       - name: Update syzkaller pipeline
26 |         run: |
27 |           cd fuzzer/syzkaller
28 |           ./process_osv.sh base.json process.json > output-new.json
29 |           mv output-new.json output.json
30 |           mv process.json base.json
31 |       - run: |
32 |           git config --local user.email "action@github.com"
33 |           git config --local user.name "GitHub Action"
34 |           git add fuzzer/syzkaller/output.json fuzzer/syzkaller/base.json
35 |           git diff
36 |           git commit -m '[automatic] update output.json and base.json' || true
37 |           git push
38 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/list_to_unique.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import json
 4 | import sys
 5 | 
 6 | def main(argv):
 7 |     unique_bugs = {}
 8 | 
 9 |     if len(argv) < 1:
10 |         raise Exception("No input file")
11 | 
12 |     with open(argv[0]) as list_file:
13 |         while True:
14 |             line = list_file.readline()
15 |             if not line:
16 |                 break
17 |             item = json.loads(line)
18 |             ids = item['syzkaller'] + item['fixed_by'] + item['cve']
19 |             canonical = None
20 |             for id in ids:
21 |                 if id in unique_bugs:
22 |                     canonical = unique_bugs[id]
23 |                     for key in canonical.keys():
24 |                         canonical[key] = list(set(canonical[key] + item[key]))
25 | 
26 |             if canonical is None:
27 |                 canonical = item
28 | 
29 |             for id in ids:
30 |                 if id in unique_bugs:
31 |                     for key in canonical.keys():
32 |                         canonical[key] = list(set(canonical[key] + unique_bugs[id][key]))
33 |             
34 |             ids = canonical['syzkaller'] + canonical['fixed_by'] + canonical['cve']
35 |             for id in ids:
36 |                 unique_bugs[id] = canonical
37 |         
38 |         for id in unique_bugs.keys():
39 |             unique_bugs[id] = json.dumps(unique_bugs[id])
40 |         
41 |         print(json.dumps([json.loads(item) for item in list(set(unique_bugs.values()))]))
42 | 
43 | if __name__ == "__main__":
44 |    main(sys.argv[1:])
45 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/summary_to_processed.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | import uuid
 4 | 
 5 | def main(argv):
 6 |     with open(argv[0]) as base_file, open(argv[1]) as delta_file:
 7 |         base = json.load(base_file)
 8 |         delta = json.load(delta_file)
 9 |         all_bugs = {}
10 |         for bug in base + delta:
11 |             is_dupe = False
12 |             has_osv = len(bug['osvs']) > 0
13 |             for id in bug['unique_ids']:
14 |                 if id in all_bugs:
15 |                     if has_osv and len(all_bugs[id]['osvs']) > 0:
16 |                         print("two OSVs share reference %s (%s and %s)" % (id, bug['osvs'], all_bugs[id]['osvs']), file=sys.stderr)
17 |                     is_dupe = True
18 | 
19 |             if not is_dupe or has_osv:
20 |                 for id in bug['unique_ids']:
21 |                     if id not in all_bugs:
22 |                         all_bugs[id] = bug
23 |                     else:
24 |                         dupe_bug = all_bugs[id]
25 |                         for id in dupe_bug['unique_ids']:
26 |                             all_bugs[id] = bug
27 |                             bug['unique_ids'] = list(
28 |                                 set(bug['unique_ids'] + dupe_bug['unique_ids']))
29 |                             bug['unique_ids'].sort()
30 | 
31 |         unique_bugs = list(set([json.dumps(bug) for bug in all_bugs.values()]))
32 |         unique_bugs.sort(reverse=True)
33 | 
34 |         osv_eligible_bugs = []
35 |         for bug_serialized in unique_bugs:
36 |             bug = json.loads(bug_serialized)
37 |             if not len(bug['osvs']):
38 |                 bug['osvs'].append(str(uuid.uuid4()))
39 |             osv_eligible_bugs.append(bug)
40 | 
41 |         print(json.dumps(osv_eligible_bugs))
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     main(sys.argv[1:])
46 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/processed_to_osv.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import json
 3 | 
 4 | def get_reference_type(url):
 5 |     if "git.kernel.org" in url:
 6 |         return "FIX"
 7 |     if "syzkaller.appspot.com" in url:
 8 |         return "REPORT"
 9 |     if "groups.google.com" in url:
10 |         return "DISCUSSION"
11 |     return "WEB"
12 | 
13 | def main(argv):
14 |   output = []
15 |   with open(argv[0]) as processed_file:
16 |       processed_bugs = json.load(processed_file)
17 |       for bug in processed_bugs:
18 |           if len(bug['osvs']) == 1:
19 |               osv_num = bug['osvs'][0]
20 |               osv_record = {
21 |                 "id": osv_num,
22 |                 "summary": bug["summary"],
23 |                 "details": bug["description"],
24 |                 "references": [
25 |                     {
26 |                         "type": get_reference_type(reference),
27 |                         "url": reference
28 |                     } for reference in bug["references"]
29 |                 ],
30 |                 "affected": [{
31 |                     "package": {
32 |                         "name": "Kernel",
33 |                         "ecosystem": "Linux"
34 |                     },
35 |                     "ranges": [{
36 |                         "type": "GIT",
37 |                         "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/",
38 |                         "events": [
39 |                             {"introduced": version} for version in bug["versions"]["affected"]
40 |                         ] + [
41 |                             {"limit": version} for version in bug["versions"]["fixed"]
42 |                         ]
43 |                     }]
44 |                 }]
45 |               }
46 |               output.append(osv_record)
47 |   print(json.dumps(output))
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main(sys.argv[1:])
52 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/db_to_list.sql:
--------------------------------------------------------------------------------
  1 | WITH `base` AS (
  2 |   SELECT
  3 |     DISTINCT
  4 |     `syzkaller`,
  5 |     (
  6 |         SELECT
  7 |           `commit`
  8 |         FROM
  9 |           `tags`
 10 |         WHERE
 11 |           `commit` = `fixed_by`
 12 |     ) `fixed_by`,
 13 |     (
 14 |         SELECT
 15 |           `upstream`
 16 |         FROM
 17 |           `upstream`
 18 |         WHERE
 19 |           `commit` = `fixed_by`
 20 |         UNION ALL
 21 |         SELECT
 22 |           `commit`
 23 |         FROM
 24 |           `tags`
 25 |         WHERE
 26 |           `commit` = `fixed_by`
 27 |           AND LENGTH(REPLACE(`tags`, ".", "")) = LENGTH(`tags`) - 1
 28 |     ) `fixed_by_upstream`,
 29 |     IIF(
 30 |       LENGTH(`introduced_by_short`)<4,
 31 |       null,
 32 |       (
 33 |         SELECT
 34 |           `commit`
 35 |         FROM
 36 |           `tags`
 37 |         WHERE
 38 |           `commit` >= `introduced_by_short`
 39 |           AND `commit` < `introduced_by_short`||"g"
 40 |       )
 41 |     ) `introduced_by`
 42 |   FROM
 43 |     (
 44 |       SELECT
 45 |         `syzkaller`,
 46 |         `fixed_by`,
 47 |         (
 48 |           SELECT
 49 |             SUBSTR(`fixes`, 0, INSTR(`fixes`, " "))
 50 |           FROM
 51 |             `fixes`
 52 |           WHERE
 53 |             `commit` = `fixed_by`
 54 |         ) `introduced_by_short`
 55 |       FROM
 56 |         (
 57 |           SELECT
 58 |             'extid=' || SUBSTRING(
 59 |               `reported_by`,
 60 |               INSTR(`reported_by`, "bot+") + LENGTH("bot+"),
 61 |               INSTR(`reported_by`, "@") - INSTR(`reported_by`, "bot+") - LENGTH("bot+")
 62 |             ) `syzkaller`,
 63 |             `commit` `fixed_by`
 64 |           FROM
 65 |             `reported_by`
 66 |           WHERE
 67 |             `reported_by`
 68 |           LIKE
 69 |             "%bot+%"
 70 |           UNION ALL
 71 |           SELECT
 72 |             'id=' || `syzkaller` `syzkaller`,
 73 |             `commit` `fixed_by`
 74 |           FROM
 75 |             `syzkaller`
 76 |         )
 77 |     )
 78 |   WHERE
 79 |     `introduced_by` is not null
 80 |   ),
 81 | `tagged` AS
 82 |   (
 83 |     SELECT
 84 |       `syzkaller`,
 85 |       `fixed_by_upstream`,
 86 |       `fixed_by`,
 87 |       (
 88 |         SELECT
 89 |           SUBSTR(`tags`, 1 + LENGTH('tags/'), MIN(INSTR(`tags`||'~', '~'), INSTR(`tags`||'-', '-')) - LENGTH('tags/') - 1)
 90 |         FROM
 91 |           `tags`
 92 |         WHERE
 93 |           `commit`=`fixed_by`
 94 |       ) `fixed_by_tag`,
 95 |       `introduced_by`,
 96 |       (
 97 |         SELECT
 98 |           SUBSTR(`tags`, 1 + LENGTH('tags/'), MIN(INSTR(`tags`||'~', '~'), INSTR(`tags`||'-', '-')) - LENGTH('tags/') - 1)
 99 |         FROM
100 |           `tags`
101 |         WHERE
102 |           `commit`=`introduced_by`
103 |       ) `introduced_by_tag`
104 |     FROM
105 |       `base`
106 |   ),
107 | `relevant` AS
108 |   (
109 |     SELECT
110 |       `syzkaller`,
111 |       `introduced_by`,
112 |       `introduced_by_tag`,
113 |       `fixed_by_upstream`,
114 |       (SELECT GROUP_CONCAT(`commit`, ',') FROM `upstream` WHERE `upstream`=`fixed_by_upstream`) `fixed_by_downstream`,
115 |       `fixed_by`,
116 |       `fixed_by_tag`,
117 |       (SELECT GROUP_CONCAT(`cve`, ',') FROM `cve` WHERE `commit`=`fixed_by`) `cve`
118 |     FROM
119 |       `tagged`
120 |     WHERE
121 |       `fixed_by_tag`<>`introduced_by_tag`
122 |   )
123 | SELECT * FROM `relevant`;
124 | 


--------------------------------------------------------------------------------
/fuzzer/README.md:
--------------------------------------------------------------------------------
 1 | # Fuzzer CVE integration
 2 | 
 3 | This directory will host a proof of concept on how we could integrate a high-confidence fuzzer into automatic CVE generation.
 4 | 
 5 | ## Deduplication
 6 | 
 7 | Vulnerability collisions are common, specially on the fuzzing space. In addition, duplicates of CVEs can cause unecessary work for their consumers, which is why accurate deduplication of issues should be a blocking requirement for any CVE automation.
 8 | 
 9 | The problem of deduplication is hence divided in two:
10 |  * Deduplication of crashes - that is minimizing the number of duplicate findings from the fuzzing source
11 |  * Deduplication of CVEs - that is minimizing the number of duplicate CVEs from the existing CVEs (in case of bug collisions, for example)
12 | 
13 | ### Deduplication of crashes
14 | 
15 | Scanners often find the exact same issues multiple times. Despite best efforts, it's extremely difficult to deduplicate an issue fully automatically consistently. As such, when sourcing from a fuzzer, every input should be considered a potential duplicate.
16 | 
17 | The trivial way to minimize the noise is to **deduplicate by the fix**. That is, if the scanner can identify the version that fixed an issue, then we can consider all issues fixed by the same version as the same vulnerability. This works best in cases when versions are highly granular (eg, commits) where a single fix resolves a single commit.
18 | 
19 | It's important to take into consideration backports when deduplicating, as they can wrongly appear to fix different issues when they might fix the same one.
20 | 
21 | ### Deduplication of CVEs
22 | 
23 | Since fuzzing is public, and mostly open-source, it is very common for two people to find the same vulnerability independently, and possibly issue the same CVE. Similar to the deduplication of crashes, one can check the references of the affected version. For open-source fuzzers, linking to the public tracker, crash and any other references can also help identify any potential concerns.
24 | 
25 | Additionally, listing the CVEs automatically on the fuzzing dashboards can help avoid duplication of effort. Reserving a CVE ID, and waiting for some time before populating its details can also help the community unify along a single CVE identifier (and if a duplicate is published in the meantime, then one would just avoid issuing a duplicate).
26 | 
27 | ## Repeatability
28 | 
29 | Crashes that have obscure or hard to nail down issues are common in fuzzers. One important component for a valid vulnerability is that of it being reproducible. This allows to know objectively whether a bug exists in code or not.
30 | 
31 | If a given vulnerability has no reproducer, it might be thought to be fixed, but it might not be in reality. As such, for any vulnerability to be considered valid, it should be possible to objectively observe it.
32 | 
33 | ## Exploitability
34 | 
35 | The main distinction between security bugs and normal bugs is their ability to affect users security.
36 | 
37 | For the purposes of CVEs, the bar to qualify for a CVE is reasonably low, however, most CVEs need to meet a minimum bar of security impact for them to be taken seriously by anyone. Determining the impact of a vulnerability is mostly manual analysis and it requires understanding the impact of every issue. Doing so automatically for any type of bug is challenging.
38 | 
39 | A possible solution is to simply have a high bar for what counts as a valid vulnerability, and only file issues above that bar. Anything else would need manual analysis.
40 | 
41 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/unique_to_delta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import google.auth
 4 | 
 5 | import json
 6 | import sys
 7 | 
 8 | def main(argv):
 9 |     bugs = []
10 |     if len(argv) < 1:
11 |         raise Exception("No input file")
12 | 
13 |     with open(argv[0]) as unique_file:
14 |         unique_bugs = json.load(unique_file)
15 |         for bug in unique_bugs:
16 |             crashes = set()
17 |             commits = set()
18 |             repros = set()
19 |             discussions = set()
20 |             syzkaller_links = set()
21 |             is_kasan = False
22 |             for syzkaller in bug['syzkaller']:
23 |                 try:
24 |                     with open("syzkaller/bug_%s.json"%syzkaller) as syzkaller_bug_file:
25 |                         try:
26 |                             syzkaller_bug = json.load(syzkaller_bug_file)
27 |                             syzkaller_links.add("https://syzkaller.appspot.com/bug?%s" % syzkaller)
28 |                             crashes.add(syzkaller_bug['title'])
29 |                             if 'discussions' in syzkaller_bug:
30 |                                 for discussion in syzkaller_bug['discussions']:
31 |                                     discussions.add(discussion)
32 |                                     syzkaller_links.add(discussion)
33 |                             if 'crashes' in syzkaller_bug:
34 |                                 for crash in syzkaller_bug['crashes']:
35 |                                     if 'title' in crash:
36 |                                         crashes.add(crash['title'])
37 |                                     if 'syz-reproducer' in crash:
38 |                                         repros.add(crash['syz-reproducer'])
39 |                             if 'fix-commits' in syzkaller_bug:
40 |                                 for commit in syzkaller_bug['fix-commits']:
41 |                                     if 'title' in commit and commit['title']:
42 |                                         commits.add(commit['title'])
43 |                                     if 'link' in commit and commit['link']:
44 |                                         syzkaller_links.add(commit['link'])
45 |                         except json.decoder.JSONDecodeError:
46 |                             pass
47 |                 except FileNotFoundError:
48 |                     pass
49 |             if len(repros) > 0:
50 |                 for crash in crashes:
51 |                     if 'KASAN' in crash and 'null-ptr-deref' not in crash:
52 |                         is_kasan = True
53 |                 if not is_kasan:
54 |                     continue
55 |                 bugs.append({
56 |                     "cves": bug["cve"],
57 |                     "osvs": [],
58 |                     "unique_ids": bug['cve'] + bug['fixed_by'] + bug ['syzkaller'],
59 |                     "summary_inputs": {
60 |                         "commits": list(commits),
61 |                         "crashes": list(crashes),
62 |                         "fixed_by_upstream": list(bug['fixed_by_upstream']),
63 |                         "fixed_by_tag": list(bug['fixed_by_tag']),
64 |                         "introduced_by_tag": list(bug['introduced_by_tag'])
65 |                     },
66 |                     "references": list(syzkaller_links),
67 |                     "versions": {
68 |                         "fixed": bug["fixed_by_upstream"] + bug["fixed_by_downstream"],
69 |                         "affected": bug["introduced_by"] or ["0"]
70 |                     }
71 |                 })
72 |     print(json.dumps(bugs))
73 | 
74 | if __name__ == "__main__":
75 |    main(sys.argv[1:])
76 | 


--------------------------------------------------------------------------------
/fuzzer/syzkaller/delta_to_summary.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | import google.auth
  4 | from google.auth.transport.requests import AuthorizedSession
  5 | 
  6 | import json
  7 | import requests
  8 | import sys
  9 | 
 10 | def query_llm_prompts(authed_session, prompts):
 11 |     response = authed_session.post(
 12 |         'https://us-central1-aiplatform.googleapis.com/v1/projects/sdcpocs/locations/us-central1/publishers/google/models/text-bison:predict',
 13 |         json={
 14 |             "instances": [{"prompt": prompt["prompt"]} for prompt in prompts],
 15 |             "parameters": {
 16 |                 "temperature": 0,
 17 |                 "maxOutputTokens": 256,
 18 |                 "topK": 1,
 19 |                 "topP": 0.0
 20 |             }
 21 |         })
 22 |     print(response.content, file=sys.stderr)
 23 |     predictions = json.loads(response.content)["predictions"]
 24 |     return [prediction["content"] + prompts[idx]["trailer"] if "content" in prediction else prompts[idx]["fallback"] for idx, prediction in enumerate(predictions)]
 25 | 
 26 | 
 27 | def write_short_summary_prompt(commits, crashes, upstream_fix, fixed_tags, vuln_tags, msgs):
 28 |     prompt = """Provide a summary following the template:
 29 | ```
 30 | The [COMPONENT] in the Linux kernel has a [PROBLEMTYPE] vulnerability. This vulnerability could be exploited by an attacker to cause memory corruption when [ROOTCAUSE].
 31 | ```
 32 | 
 33 | Variables:
 34 | - PROBLEMTYPE: What vulnerability was fixed? (summarize what the memory corruption vulnerability that was fixed based on the patch commit message)
 35 | - COMPONENT: In which subsystem was the bug? (extract this information from the commit title, affected files and reproducer)
 36 | - ROOTCAUSE: What was the cause for the vulnerability? (summarize the mistake in code fixed by the commit message)
 37 | 
 38 | EXAMPLE: CRASH: ["", "KASAN: global-out-of-bounds Read in crypto_chacha20_crypt"] COMMITS: ["crypto: skcipher - set walk.iv for zero-length inputs"] DESCRIPTION: ["crypto: skcipher - set walk.iv for zero-length inputs\n\nAll the ChaCha20 algorithms as well as the ARM bit-sliced AES-XTS\nalgorithms call skcipher_walk_virt(), then access the IV (walk.iv)\nbefore checking whether any bytes need to be processed (walk.nbytes).\n\nBut if the input is empty, then skcipher_walk_virt() doesn't set the IV,\nand the algorithms crash trying to use the uninitialized IV pointer.\n\nFix it by setting the IV earlier in skcipher_walk_virt().  Also fix it\nfor the AEAD walk functions.\n\nThis isn't a perfect solution because we can't actually align the IV to\n->cra_alignmask unless there are bytes to process, for one because the\ntemporary buffer for the aligned IV is freed by skcipher_walk_done(),\nwhich is only called when there are bytes to process.  Thus, algorithms\nthat require aligned IVs will still need to avoid accessing the IV when\nwalk.nbytes == 0.  Still, many algorithms/architectures are fine with\nIVs having any alignment, and even for those that aren't, a misaligned\npointer bug is much less severe than an uninitialized pointer bug.\n\nThis change also matches the behavior of the older blkcipher_walk API.\n\nFixes: 0cabf2af6f5a (\"crypto: skcipher - Fix crash on zero-length input\")\nReported-by: syzbot <syzkaller@googlegroups.com>\nCc: <stable@vger.kernel.org> # v4.14+\nSigned-off-by: Eric Biggers <ebiggers@google.com>\nSigned-off-by: Herbert Xu <herbert@gondor.apana.org.au>\n"]
 39 | ANSWER: The crypto subsystem of the Linux kernel has an out of bounds vulnerability. The vulnerability could be exploited by an attacker to cause memory corruption when crypto algorithm implementations like ChaCha20 and ARM's bit-sliced AES-XTS read an uninitialized IV pointer when the input is empty.
 40 | 
 41 | EXAMPLE: CRASH: ["", "KASAN: use-after-free Read in sock_def_write_space"] COMMITS: ["llc: make sure applications use ARPHRD_ETHER"] DESCRIPTION: ["llc: make sure applications use ARPHRD_ETHER\n\nsyzbot was to trigger a bug by tricking AF_LLC with\nnon sensible addr->sllc_arphrd\n\nIt seems clear LLC requires an Ethernet device.\n\nBack in commit abf9d537fea2 (\"llc: add support for SO_BINDTODEVICE\")\nOctavian Purdila added possibility for application to use a zero\nvalue for sllc_arphrd, convert it to ARPHRD_ETHER to not cause\nregressions on existing applications.\n\nBUG: KASAN: use-after-free in __read_once_size include/linux/compiler.h:199 [inline]\nBUG: KASAN: use-after-free in list_empty include/linux/list.h:268 [inline]\nBUG: KASAN: use-after-free in waitqueue_active include/linux/wait.h:126 [inline]\nBUG: KASAN: use-after-free in wq_has_sleeper include/linux/wait.h:160 [inline]\nBUG: KASAN: use-after-free in skwq_has_sleeper include/net/sock.h:2092 [inline]\nBUG: KASAN: use-after-free in sock_def_write_space+0x642/0x670 net/core/sock.c:2813\nRead of size 8 at addr ffff88801e0b4078 by task ksoftirqd/3/27\n\nCPU: 3 PID: 27 Comm: ksoftirqd/3 Not tainted 5.5.0-rc1-syzkaller #0\nHardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014\nCall Trace:\n __dump_stack lib/dump_stack.c:77 [inline]\n dump_stack+0x197/0x210 lib/dump_stack.c:118\n print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374\n __kasan_report.cold+0x1b/0x41 mm/kasan/report.c:506\n kasan_report+0x12/0x20 mm/kasan/common.c:639\n __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135\n __read_once_size include/linux/compiler.h:199 [inline]\n list_empty include/linux/list.h:268 [inline]\n waitqueue_active include/linux/wait.h:126 [inline]\n wq_has_sleeper include/linux/wait.h:160 [inline]\n skwq_has_sleeper include/net/sock.h:2092 [inline]\n sock_def_write_space+0x642/0x670 net/core/sock.c:2813\n sock_wfree+0x1e1/0x260 net/core/sock.c:1958\n skb_release_head_state+0xeb/0x260 net/core/skbuff.c:652\n skb_release_all+0x16/0x60 net/core/skbuff.c:663\n __kfree_skb net/core/skbuff.c:679 [inline]\n consume_skb net/core/skbuff.c:838 [inline]\n consume_skb+0xfb/0x410 net/core/skbuff.c:832\n __dev_kfree_skb_any+0xa4/0xd0 net/core/dev.c:2967\n dev_kfree_skb_any include/linux/netdevice.h:3650 [inline]\n e1000_unmap_and_free_tx_resource.isra.0+0x21b/0x3a0 drivers/net/ethernet/intel/e1000/e1000_main.c:1963\n e1000_clean_tx_irq drivers/net/ethernet/intel/e1000/e1000_main.c:3854 [inline]\n e1000_clean+0x4cc/0x1d10 drivers/net/ethernet/intel/e1000/e1000_main.c:3796\n napi_poll net/core/dev.c:6532 [inline]\n net_rx_action+0x508/0x1120 net/core/dev.c:6600\n __do_softirq+0x262/0x98c kernel/softirq.c:292\n run_ksoftirqd kernel/softirq.c:603 [inline]\n run_ksoftirqd+0x8e/0x110 kernel/softirq.c:595\n smpboot_thread_fn+0x6a3/0xa40 kernel/smpboot.c:165\n kthread+0x361/0x430 kernel/kthread.c:255\n ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352\n\nAllocated by task 8247:\n save_stack+0x23/0x90 mm/kasan/common.c:72\n set_track mm/kasan/common.c:80 [inline]\n __kasan_kmalloc mm/kasan/common.c:513 [inline]\n __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:486\n kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:521\n slab_post_alloc_hook mm/slab.h:584 [inline]\n slab_alloc mm/slab.c:3320 [inline]\n kmem_cache_alloc+0x121/0x710 mm/slab.c:3484\n sock_alloc_inode+0x1c/0x1d0 net/socket.c:240\n alloc_inode+0x68/0x1e0 fs/inode.c:230\n new_inode_pseudo+0x19/0xf0 fs/inode.c:919\n sock_alloc+0x41/0x270 net/socket.c:560\n __sock_create+0xc2/0x730 net/socket.c:1384\n sock_create net/socket.c:1471 [inline]\n __sys_socket+0x103/0x220 net/socket.c:1513\n __do_sys_socket net/socket.c:1522 [inline]\n __se_sys_socket net/socket.c:1520 [inline]\n __ia32_sys_socket+0x73/0xb0 net/socket.c:1520\n do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline]\n do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408\n entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139\n\nFreed by task 17:\n save_stack+0x23/0x90 mm/kasan/common.c:72\n set_track mm/kasan/common.c:80 [inline]\n kasan_set_free_info mm/kasan/common.c:335 [inline]\n __kasan_slab_free+0x102/0x150 mm/kasan/common.c:474\n kasan_slab_free+0xe/0x10 mm/kasan/common.c:483\n __cache_free mm/slab.c:3426 [inline]\n kmem_cache_free+0x86/0x320 mm/slab.c:3694\n sock_free_inode+0x20/0x30 net/socket.c:261\n i_callback+0x44/0x80 fs/inode.c:219\n __rcu_reclaim kernel/rcu/rcu.h:222 [inline]\n rcu_do_batch kernel/rcu/tree.c:2183 [inline]\n rcu_core+0x570/0x1540 kernel/rcu/tree.c:2408\n rcu_core_si+0x9/0x10 kernel/rcu/tree.c:2417\n __do_softirq+0x262/0x98c kernel/softirq.c:292\n\nThe buggy address belongs to the object at ffff88801e0b4000\n which belongs to the cache sock_inode_cache of size 1152\nThe buggy address is located 120 bytes inside of\n 1152-byte region [ffff88801e0b4000, ffff88801e0b4480)\nThe buggy address belongs to the page:\npage:ffffea0000782d00 refcount:1 mapcount:0 mapping:ffff88807aa59c40 index:0xffff88801e0b4ffd\nraw: 00fffe0000000200 ffffea00008e6c88 ffffea0000782d48 ffff88807aa59c40\nraw: ffff88801e0b4ffd ffff88801e0b4000 0000000100000003 0000000000000000\npage dumped because: kasan: bad access detected\n\nMemory state around the buggy address:\n ffff88801e0b3f00: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc\n ffff88801e0b3f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc\n>ffff88801e0b4000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb\n                                                                ^\n ffff88801e0b4080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb\n ffff88801e0b4100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb\n\nFixes: abf9d537fea2 (\"llc: add support for SO_BINDTODEVICE\")\nSigned-off-by: Eric Dumazet <edumazet@google.com>\nSigned-off-by: David S. Miller <davem@davemloft.net>\n"]
 42 | ANSWER: The llc subsystem of the Linux Kernel has a use after free vulnerability. The vulnerability could be exploited by an attacker to cause memory corruption when passing a zero value for sllc_arphrd while LLC expected an ethernet device.
 43 | 
 44 | EXAMPLE: CRASH: ["", "KASAN: use-after-free Read in mcba_usb_disconnect"] COMMITS: ["can: mcba_usb: fix use-after-free on disconnect"] DESCRIPTION: ["can: mcba_usb: fix use-after-free on disconnect\n\nThe driver was accessing its driver data after having freed it.\n\nFixes: 51f3baad7de9 (\"can: mcba_usb: Add support for Microchip CAN BUS Analyzer\")\nCc: stable <stable@vger.kernel.org>     # 4.12\nCc: Remigiusz Ko\u0142\u0142\u0105taj <remigiusz.kollataj@mobica.com>\nReported-by: syzbot+e29b17e5042bbc56fae9@syzkaller.appspotmail.com\nSigned-off-by: Johan Hovold <johan@kernel.org>\nSigned-off-by: Marc Kleine-Budde <mkl@pengutronix.de>\n"]
 45 | ANSWER: The can/mcba_usb subsystem of the Linux kernel has a use after free vulnerability. The vulnerability could be exploited by an attacker to cause memory corruption when the driver accessed data after having it freed while disconnecting.
 46 | 
 47 | EXAMPLE: CRASH: ["KASAN: use-after-free Read in bdev_evict_inode"] COMMITS: ["block: ensure the bdi is freed after inode_detach_wb"] DESCRIPTION: ["block: ensure the bdi is freed after inode_detach_wb\n\ninode_detach_wb references the \"main\" bdi of the inode.  With the\nrecent change to move the bdi from the request_queue to the gendisk\nthis causes a guaranteed use after free when using certain cgroup\nconfigurations.  The big itself is older through as any non-default\ninode reference (e.g. an open file descriptor) could have injected\nthis use after free even before that.\n\nFixes: 52ebea749aae (\"writeback: make backing_dev_info host cgroup-specific bdi_writebacks\")\nReported-by: Qian Cai <quic_qiancai@quicinc.com>\nReported-by: syzbot <syzbot+1fb38bb7d3ce0fa3e1c4@syzkaller.appspotmail.com>\nSigned-off-by: Christoph Hellwig <hch@lst.de>\nLink: https://lore.kernel.org/r/20210816122614.601358-3-hch@lst.de\nSigned-off-by: Jens Axboe <axboe@kernel.dk>\n"]
 48 | ANSWER: The block subsystem of the Linux Kernel has a use after free vulnerability. The vulnerability could be exploited by an attacker to cause memory corruption when using certain cgroup configurations because the bdi of the inode wasn't always freed after inode_detach_wb.
 49 | 
 50 | EXAMPLE: CRASH: ["", "KASAN: use-after-free Read in sock_def_write_space (2)"] COMMITS: ["qrtr: orphan socket in qrtr_release()"] DESCRIPTION: ["qrtr: orphan socket in qrtr_release()\n\nWe have to detach sock from socket in qrtr_release(),\notherwise skb->sk may still reference to this socket\nwhen the skb is released in tun->queue, particularly\nsk->sk_wq still points to &sock->wq, which leads to\na UAF.\n\nReported-and-tested-by: syzbot+6720d64f31c081c2f708@syzkaller.appspotmail.com\nFixes: 28fb4e59a47d (\"net: qrtr: Expose tunneling endpoint to user space\")\nCc: Bjorn Andersson <bjorn.andersson@linaro.org>\nCc: Eric Dumazet <eric.dumazet@gmail.com>\nSigned-off-by: Cong Wang <xiyou.wangcong@gmail.com>\nReviewed-by: Eric Dumazet <edumazet@google.com>\nSigned-off-by: David S. Miller <davem@davemloft.net>\n"]
 51 | ANSWER: The qrtr subsystem of the Linux kernel has a use after free vulnerability. The vulnerability could be exploited by an attacker to cause memory corruption when the socket was not properly detached from the qrtr, as otherwise skb->sk may still reference the socket when skb is released in tun->queue.
 52 | 
 53 | EXAMPLE: CRASH: %s COMMITS: %s DESCRIPTION: %s
 54 | ANSWER:""" % (json.dumps(list(crashes)), json.dumps(list(commits)), json.dumps(msgs))
 55 |     fix_trailer = " This vulnerability exists in all versions of the Linux Kernel from %s until commit %s (%s)." % (
 56 |         ', '.join(vuln_tags), ', '.join(upstream_fix), ', '.join(fixed_tags)
 57 |     )
 58 |     return {
 59 |         "prompt": prompt,
 60 |         "fallback": ", ".join(list(commits)),
 61 |         "trailer": fix_trailer
 62 |     }
 63 | 
 64 | def write_long_description_prompt(msgs):
 65 |     prompt = """You are a vulnerability description generator.
 66 | 
 67 | I have a Linux kernel commit that fixes a potential security vulnerability.
 68 | Quickly explain in technical terms the possible vulnerability being fixed by this commit and the possible security impact.
 69 | Briefly explain the vulnerability type and how it usually is exploited.
 70 | Respond with "The patch commit for this vulnerability fixes ..." with the description of the fix and the potential vulnerability.
 71 | Follow with "Vulnerabilities of type ... are exploited by ..." and a brief description of that vulnerability type and how to exploit them.
 72 | End with "The security impact of this vulnerability could be ..." and a reason explaining the worst case scenario that a potential vulnerability like this could have and the most likely case.
 73 | The last sentence of the output should be "To resolve this vulnerability patch the kernel past the fix commit."
 74 | 
 75 | Expand all acronyms so someone that doesn't know the code can understand it.
 76 | Purpose is to explain the vulnerability so no subjective opinions only facts.
 77 | Mention that this vulnerability has a confirmed proof of concept code and the vendor has provided an official fix.
 78 | 
 79 | Keep the response below 200 words.
 80 | 
 81 | ---
 82 | ```
 83 | %s
 84 | ```""" % ("```\n```".join(msgs))
 85 |     return {
 86 |         "prompt": prompt,
 87 |         "fallback": "\n\n".join(list(msgs)),
 88 |         "trailer": "\n\nThis description was automatically generated based on the commit message."
 89 |     }
 90 | 
 91 | def main(argv):
 92 |     creds, project = google.auth.default(scopes=['https://www.googleapis.com/auth/cloud-platform'])
 93 |     authed_session = AuthorizedSession(creds)
 94 | 
 95 |     bugs = []
 96 |     if len(argv) < 1:
 97 |         raise Exception("No input file")
 98 | 
 99 |     with open(argv[0]) as delta_file:
100 |         delta_bugs = json.load(delta_file)
101 |         for bug in delta_bugs:
102 |             msgs = []
103 |             for commit in bug['summary_inputs']['fixed_by_upstream']:
104 |                 response = requests.get("https://kernel.googlesource.com/pub/scm/linux/kernel/git/stable/linux-stable/+/%s?format=json" % commit)
105 |                 try:
106 |                     msgs.append(json.loads(response.content[5:])['message'][0:2000])
107 |                 except json.decoder.JSONDecodeError:
108 |                     pass
109 |             bug["summary"] = query_llm_prompts(authed_session, [write_short_summary_prompt(
110 |                 bug['summary_inputs']['commits'],
111 |                 bug['summary_inputs']['crashes'],
112 |                 bug['summary_inputs']['fixed_by_upstream'],
113 |                 bug['summary_inputs']['fixed_by_tag'],
114 |                 bug['summary_inputs']['introduced_by_tag'],
115 |                 msgs)])
116 |             bug["description"] = query_llm_prompts(authed_session, [write_long_description_prompt(msgs)])
117 |             bugs.append(bug)
118 | 
119 |     print(json.dumps(bugs))
120 | 
121 | if __name__ == "__main__":
122 |    main(sys.argv[1:])
123 | 


--------------------------------------------------------------------------------