├── requirements.txt
├── .github
│   └── workflows
│       ├── long.yml
│       ├── manual.yml
│       └── core.yml
├── LICENSE
├── README.md
├── scholar_utils.py
├── _types.py
├── arxiv_utils.py
├── .gitignore
├── paperstack.py
├── openai_utils.py
└── notion_utils.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | arxiv
2 | notion-client
3 | openai
4 | semanticscholar
5 | tqdm

--------------------------------------------------------------------------------
/.github/workflows/long.yml:
--------------------------------------------------------------------------------
1 | name: Paperstack (Long)
2 | 
3 | on:
4 |   schedule:
5 |     - cron: '0 6 * * *'
6 | 
7 | jobs:
8 |   call-core:
9 |     uses: ./.github/workflows/core.yml
10 |     secrets: inherit
11 |     with:
12 |       search-arxiv: true
13 |       search-scholar: true

--------------------------------------------------------------------------------
/.github/workflows/manual.yml:
--------------------------------------------------------------------------------
1 | name: Paperstack (Manual)
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     inputs:
6 |       search-arxiv:
7 |         description: 'Search arXiv?'
8 |         required: false
9 |         default: false
10 |         type: 'boolean'
11 |       search-scholar:
12 |         description: 'Search Semantic Scholar?'
13 |         required: false
14 |         default: false
15 |         type: 'boolean'
16 | 
17 | jobs:
18 |   call-core:
19 |     uses: ./.github/workflows/core.yml
20 |     secrets: inherit
21 |     with:
22 |       search-arxiv: ${{ inputs.search-arxiv }}
23 |       search-scholar: ${{ inputs.search-scholar }}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 dreadnode
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # paperstack
2 | 
3 | Paperstack uses arXiv and Semantic Scholar (for related-paper recommendations) to sync academic paper information into a Notion DB, with some lightweight use of OpenAI models for summarization and categorization. It was built for gathering machine learning and security-related papers, but is easily adapted to any other subject (`ARXIV_SEARCH`/`--arxiv-search-query`). Its deployment is geared toward GitHub Actions, but it can also be run directly from the command line. It can even detect partial entries (an arXiv link or a title) in the Notion DB and fill in the remaining information.
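For a local run, export the tokens and invoke the script directly (the values below are placeholders; see `paperstack.py` for the full argument list):

```
export NOTION_TOKEN="..."
export NOTION_DATABASE_ID="..."
export OPENAI_API_TOKEN="..."

pip install -r requirements.txt
python paperstack.py --search-arxiv --search-semantic-scholar
```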
4 | 
5 | The Notion DB requires a semi-fixed structure as a function of the syncing logic (`notion_utils.py`), and you're free to add columns and custom syncing behavior as needed. Here is the minimum database layout the tool currently expects:
6 | 
7 | ```
8 | Title [Title]
9 | Summary [Text]
10 | Focus [Select]
11 | Attack Type [Select]
12 | URL [URL]
13 | Authors [Multi-select]
14 | Published [Date]
15 | Explored [Checkbox]
16 | ```
17 | 
18 | Most command line arguments can instead be passed via environment variables, which is how the workflows provide them:
19 | 
20 | ```
21 | NOTION_TOKEN
22 | NOTION_DATABASE_ID
23 | OPENAI_API_TOKEN
24 | ```
25 | 
26 | Hack away!

--------------------------------------------------------------------------------
/.github/workflows/core.yml:
--------------------------------------------------------------------------------
1 | name: Paperstack Core
2 | 
3 | on:
4 |   workflow_call:
5 |     inputs:
6 |       search-arxiv:
7 |         description: "Search arXiv?"
8 |         required: true
9 |         type: "boolean"
10 |       search-scholar:
11 |         description: "Search Semantic Scholar?"
12 |         required: true
13 |         type: "boolean"
14 | 
15 | jobs:
16 |   run-script:
17 |     runs-on: ubuntu-latest
18 |     timeout-minutes: 120 # 2-hour timeout for the entire job
19 |     steps:
20 |       - name: Check out code
21 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
22 | 
23 |       - name: Set up Python
24 |         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
25 |         with:
26 |           python-version: "3.x"
27 | 
28 |       - name: Install dependencies
29 |         run: |
30 |           python -m pip install --upgrade pip
31 |           pip install -r requirements.txt
32 | 
33 |       - name: Run paperstack
34 |         run: |
35 |           python paperstack.py ${{ inputs.search-arxiv && '--search-arxiv' || '' }} ${{ inputs.search-scholar && '--search-semantic-scholar' || '' }}
36 |         env:
37 |           NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
38 |           NOTION_DATABASE_ID: ${{ secrets.NOTION_DATABASE_ID }}
39 |           OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }}

--------------------------------------------------------------------------------
/scholar_utils.py:
--------------------------------------------------------------------------------
1 | from semanticscholar import SemanticScholar  # type: ignore
2 | from tqdm import tqdm  # type: ignore
3 | 
4 | from _types import Paper
5 | 
6 | client = SemanticScholar()
7 | 
8 | 
9 | def get_recommended_arxiv_ids_from_semantic_scholar(
10 |     papers: list[Paper], max_results: int = 10, min_year: int = 2018
11 | ) -> list[Paper]:
12 |     results: list[dict] = []
13 |     for paper in tqdm(papers):
14 |         if not paper.url:
15 |             continue
16 | 
17 |         if not paper.arxiv_id:
18 |             continue
19 | 
20 |         try:
21 |             results.extend(
22 |                 client.get_recommended_papers(
23 |                     f"arXiv:{paper.arxiv_id}", limit=max_results * 2
24 |                 )
25 |             )
26 |             paper.explored = True
27 |         except Exception as e:
28 |             print(f"[!] {e}")
30 | 
31 |     filtered: list[dict] = []
32 |     for result in results:
33 |         if "ArXiv" not in result["externalIds"]:
34 |             continue
35 | 
36 |         arxiv_id = result["externalIds"]["ArXiv"]
37 |         if arxiv_id in [f["externalIds"]["ArXiv"] for f in filtered]:
38 |             continue
39 | 
40 |         if result["title"] in [p.title for p in papers]:
41 |             continue
42 | 
43 |         if not result["year"] or result["year"] < min_year:
44 |             continue
45 | 
46 |         filtered.append(result)
47 | 
48 |     # TODO: Sort by something important
49 | 
50 |     recommended_papers: list[Paper] = []
51 |     for result in filtered:
52 |         recommended_papers.append(
53 |             Paper(
54 |                 title=result["title"],
55 |                 url=f'https://arxiv.org/abs/{result["externalIds"]["ArXiv"]}',
56 |                 abstract=result["abstract"],
57 |             )
58 |         )
59 | 
60 |     return recommended_papers[:max_results]

--------------------------------------------------------------------------------
/_types.py:
--------------------------------------------------------------------------------
1 | import re
2 | from dataclasses import asdict, dataclass, field
3 | from datetime import datetime
4 | from enum import Enum
5 | 
6 | 
7 | class Focus(str, Enum):
8 |     Offensive = "Offensive"
9 |     Defensive = "Defensive"
10 |     Adversarial = "Adversarial"
11 |     Safety = "Safety"
12 |     Other = "Other"
13 | 
14 | class AttackType(str, Enum):
15 |     ModelEvasion = "Evasion"
16 |     ModelExtraction = "Extraction"
17 |     ModelInversion = "Inversion"
18 |     ModelPoisoning = "Poisoning"
19 |     PromptInjection = "Prompt Injection"
20 |     Other = "Other"
21 | 
22 | 
23 | @dataclass
24 | class Paper:
25 |     # Note: These fields need to be reflected in the Notion DB
26 |     # and in the notion_utils functions.
27 | 
28 |     page_id: str | None = None
29 |     title: str | None = None
30 |     url: str | None = None
31 |     focus: Focus | None = None
32 |     attack_type: AttackType | None = None
33 |     summary: str | None = None
34 |     abstract: str | None = None
35 |     authors: list[str] = field(default_factory=list)
36 |     published: datetime | None = None
37 |     explored: bool | None = None
38 | 
39 |     # We don't want to excessively write back to Notion, so we
40 |     # offer the ability to set change tracking when we read.
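    #
    # Illustrative usage (hypothetical values):
    #
    #   paper = Paper(title="X", track_changes=True)
    #   paper.has_changed()   # False - matches the snapshot taken on init
    #   paper.summary = "..."
    #   paper.has_changed()   # True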
41 | 
42 |     track_changes: bool = False
43 | 
44 |     def __post_init__(self):
45 |         self._original_state = asdict(self)
46 | 
47 |     def has_changed(self) -> bool:
48 |         if self.track_changes:
49 |             return self._original_state != asdict(self)
50 |         else:
51 |             return True
52 | 
53 |     def has_arxiv_props(self) -> bool:
54 |         return all(
55 |             [
56 |                 self.title,
57 |                 self.url,
58 |                 self.authors,
59 |                 self.published,
60 |             ]
61 |         )
62 | 
63 |     @property
64 |     def arxiv_id(self) -> str | None:
65 |         if not self.url:
66 |             return None
67 |         match = re.search(r"\d{4}\.\d{4,5}", self.url)  # YYMM.NNNN(N) arXiv ids
68 |         return match.group(0) if match else None

--------------------------------------------------------------------------------
/arxiv_utils.py:
--------------------------------------------------------------------------------
1 | import arxiv  # type: ignore
2 | 
3 | from _types import Paper
4 | 
5 | client = arxiv.Client()
6 | 
7 | 
8 | def arxiv_result_to_paper(result: arxiv.Result) -> Paper:
9 |     return Paper(
10 |         title=result.title,
11 |         url=result.entry_id,
12 |         abstract=result.summary,
13 |         authors=[a.name for a in result.authors],
14 |         published=result.published,
15 |     )
16 | 
17 | 
18 | def search_arxiv(
19 |     query: str,
20 |     max_results: int = 10,
21 |     sort_by: arxiv.SortCriterion = arxiv.SortCriterion.SubmittedDate,
22 | ) -> list[arxiv.Result]:
23 |     return list(
24 |         client.results(
25 |             arxiv.Search(
26 |                 query,
27 |                 max_results=max_results,
28 |                 sort_by=sort_by,
29 |             )
30 |         )
31 |     )
32 | 
33 | 
34 | def search_arxiv_as_paper(
35 |     query: str,
36 |     max_results: int = 10,
37 |     sort_by: arxiv.SortCriterion = arxiv.SortCriterion.SubmittedDate,
38 | ) -> list[Paper]:
39 |     return [
40 |         arxiv_result_to_paper(result)
41 |         for result in search_arxiv(query, max_results, sort_by)
42 |     ]
43 | 
44 | 
45 | def search_arxiv_by_id(id: str) -> arxiv.Result | None:
46 |     for result in client.results(arxiv.Search(id_list=[id])):
47 |         return result
48 |     return None
49 | 
50 | 
51 | def fill_papers_with_arxiv(papers: list[Paper]) -> list[Paper]:
52 |     for paper in papers:
53 |         if paper.has_arxiv_props():
54 |             continue
55 | 
56 |         result: arxiv.Result | None = None
57 | 
58 |         if paper.arxiv_id:
59 |             result = search_arxiv_by_id(paper.arxiv_id)
60 | 
61 |         if not result and paper.title:
62 |             # Dashes seem to break the API calls - finicky in general; links work much better
63 |             query = f"ti:{paper.title.replace('-', ' ')}"
64 |             searched = search_arxiv(query, max_results=1, sort_by=arxiv.SortCriterion.Relevance)
65 |             result = searched[0] if searched else None
66 | 
67 |         if not result:
68 |             print(f'[!] Could not find arxiv result for "{paper.title}" [{paper.url}]')
69 |             continue
70 | 
71 |         if paper.title and paper.title != result.title:
72 |             print(f'[!] 
Title mismatch: "{paper.title}" vs "{result.title}"') 73 | 74 | paper.title = result.title 75 | paper.url = result.entry_id 76 | paper.abstract = result.summary 77 | paper.authors = [a.name for a in result.authors] 78 | paper.published = result.published 79 | 80 | return papers 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | notion_utils.py

--------------------------------------------------------------------------------
/paperstack.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import asyncio
3 | import os
4 | from datetime import datetime
5 | 
6 | from arxiv_utils import fill_papers_with_arxiv, search_arxiv_as_paper
7 | from notion_utils import (
8 |     get_notion_client,
9 |     get_papers_from_notion,
10 |     write_papers_to_notion,
11 | )
12 | from openai_utils import (
13 |     get_attack_type_from_abstract,
14 |     get_focus_label_from_abstract,
15 |     get_openai_client,
16 |     summarize_abstract_with_openai,
17 | )
18 | from scholar_utils import get_recommended_arxiv_ids_from_semantic_scholar
19 | 
20 | ARXIV_SEARCH = """\
21 | "adversarial attacks" OR "language model attacks" OR "LLM vulnerabilities" OR \
22 | "AI security" OR "machine learning security" OR "jailbreak" OR "bypassing AI"\
23 | """
24 | 
25 | 
26 | async def main():
27 |     parser = argparse.ArgumentParser()
28 | 
29 |     parser.add_argument(
30 |         "--notion-token",
31 |         type=str,
32 |         default=os.environ.get("NOTION_TOKEN"),
33 |         help="Notion token",
34 |     )
35 |     parser.add_argument(
36 |         "--database-id",
37 |         type=str,
38 |         default=os.environ.get("NOTION_DATABASE_ID"),
39 |         help="Notion database id",
40 |     )
41 |     parser.add_argument(
42 |         "--openai-token",
43 |         type=str,
44 |         default=os.environ.get("OPENAI_API_TOKEN"),
45 |         help="OpenAI token",
46 |     )
47 |     parser.add_argument("--arxiv-search-query", type=str, default=ARXIV_SEARCH)
48 |     parser.add_argument("--search-arxiv", action="store_true", default=False)
49 |     parser.add_argument("--search-semantic-scholar", action="store_true", default=False)
50 | 
51 |     args = parser.parse_args()
52 |     print("[+] Paperstack")
53 | 
54 |     notion_client = get_notion_client(args.notion_token)
55 |     openai_client = get_openai_client(args.openai_token)
56 | 
57 |     print(f" |- Getting papers from Notion [{args.database_id}]")
58 |     papers = await get_papers_from_notion(notion_client, args.database_id)
59 |     print(f" |- {len(papers)} existing papers")
60 | 
61 |     for p in papers:
62 |         if p.published and p.published < datetime.fromisoformat("2024-07-01 00:00:00+00:00"):
63 |             p.explored = True
64 | 
65 |         if len(p.authors) > 5:
66 |             p.authors = p.authors[:5]
67 | 
68 |     if not all([p.has_arxiv_props() for p in papers]):
69 |         print(" |- Filling in missing data from arXiv")
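        # fill_papers_with_arxiv resolves each partial entry by the arXiv id
        # parsed from its URL, falling back to a title search on arXiv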
70 | papers = fill_papers_with_arxiv(papers) 71 | 72 | if args.search_arxiv: 73 | print(" |- Searching arXiv for new papers") 74 | existing_titles = [paper.title for paper in papers] 75 | for searched_paper in search_arxiv_as_paper( 76 | args.arxiv_search_query, max_results=500 77 | ): 78 | if searched_paper.title not in existing_titles: 79 | print(f" |- {searched_paper.title[:50]}...") 80 | papers.append(searched_paper) 81 | 82 | if args.search_semantic_scholar: 83 | to_explore = [p for p in papers if not p.explored] 84 | if to_explore: 85 | print(" |- Getting related papers from Semantic Scholar") 86 | recommended_papers = get_recommended_arxiv_ids_from_semantic_scholar(to_explore) 87 | papers.extend(fill_papers_with_arxiv(recommended_papers)) 88 | print(f" |- {len(recommended_papers)} new papers") 89 | else: 90 | print(" |- All papers have been explored") 91 | 92 | if not all([paper.summary for paper in papers]): 93 | print(" |- Building summaries with OpenAI") 94 | for paper in [p for p in papers if not p.summary and p.abstract]: 95 | print(f" |- {paper.title[:50]}...") 96 | paper.summary = summarize_abstract_with_openai( 97 | openai_client, paper.abstract 98 | ) 99 | 100 | if not all([paper.focus for paper in papers]): 101 | print(" |- Assigning focus labels with OpenAI") 102 | for paper in [p for p in papers if not p.focus and (p.abstract or p.summary)]: 103 | reference = paper.abstract or paper.summary 104 | paper.focus = get_focus_label_from_abstract(openai_client, reference) 105 | print(f" |- {paper.focus}") 106 | 107 | if not all([paper.attack_type for paper in papers]): 108 | print(" |- Assigning attack types with OpenAI") 109 | for paper in [p for p in papers if not p.attack_type and (p.abstract or p.summary)]: 110 | reference = paper.abstract or paper.summary 111 | paper.attack_type = get_attack_type_from_abstract(openai_client, reference) 112 | print(f" |- {paper.attack_type}") 113 | 114 | to_write = [p for p in papers if p.has_changed()] 115 | if to_write: 116 | print(f" |- Writing {len(to_write)} updates back to Notion") 117 | await write_papers_to_notion(notion_client, args.database_id, to_write) 118 | 119 | print("[+] Done!") 120 | 121 | 122 | if __name__ == "__main__": 123 | asyncio.run(main()) 124 | -------------------------------------------------------------------------------- /openai_utils.py: -------------------------------------------------------------------------------- 1 | from _types import AttackType, Focus 2 | from openai import OpenAI 3 | 4 | OpenAIClient = OpenAI 5 | 6 | SUMMARIZE_ABSTRACT_PROMPT = """\ 7 | You will be provided with an abstract of a scientific paper. \ 8 | Compress this abstract in 1-2 sentences. Use very concise language usable as \ 9 | bullet points on a slide deck. Respond ONLY with your summary. 10 | """ 11 | 12 | ASSIGN_LABEL_PROMPT = """\ 13 | You will be provided with an abstract of a scientific paper. \ 14 | Assess the most applicable focus label based on the target audience, \ 15 | research focus, produced materials, and key outcomes. 16 | 17 | {labels} 18 | 19 | Respond with ONLY ONE of the labels above. Do not include anything else in your response. 20 | """ 21 | 22 | # Attack Type descriptions 23 | 24 | EVASION_DESCRIPTION = """\ 25 | Model Evasion is an adversarial attack aimed at bypassing or evading a machine 26 | learning model's defenses, usually to make it produce incorrect outputs or behave 27 | in ways that favor the attacker. 
In this context, the adversary doesn't try to 28 | "break" the model or extract data from it (like in model inversion) but instead 29 | seeks to manipulate the model's behavior in a way that allows them to achieve a 30 | desired outcome, such as bypassing detection systems or generating misleading predictions. 31 | """ 32 | 33 | EXTRACTION_DESCRIPTION = """\ 34 | Model Extraction refers to an attack where an adversary tries to replicate or steal 35 | the functionality of a machine learning model by querying it and using the outputs 36 | to build a copy of the original model. This type of attack doesn't necessarily involve 37 | extracting sensitive data used for training, as in model inversion, but instead focuses 38 | on how the model behaves—its predictions and outputs—in order to create a surrogate or 39 | shadow model that behaves similarly to the original. 40 | """ 41 | 42 | INVERSION_DESCRIPTION = """\ 43 | Model inversion refers to a set of techniques in machine learning where an attacker 44 | tries to extract confidential information from a trained AI model by interacting with 45 | it in specific ways, often through extensive querying. By doing so, the attacker may 46 | be able to infer details about the data used to train the model. These details can 47 | range from personal information to the reconstruction of private or sensitive datasets, 48 | potentially revealing confidential information. 49 | """ 50 | 51 | POISONING_DESCRIPTION = """\ 52 | Model Poisoning is an attack on machine learning models where an adversary intentionally 53 | manipulates data in the training set to impact how a model behaves. Unlike attacks like 54 | model inversion or model extraction, which focus on extracting information from the model, 55 | model poisoning targets the model during its training phase. By introducing misleading, 56 | incorrect, or adversarial data, attackers can manipulate a model's behavior, often without 57 | detection, leading to significant security, reliability, and ethical risks. 58 | """ 59 | 60 | PROMPT_INJECTION_DESCRIPTION = """\ 61 | Prompt injection is a critical vulnerability in Large Language Models (LLMs), where malicious 62 | users manipulate model behavior by crafting inputs that override, bypass, or exploit how the 63 | model follows instructions. This vulnerability has become more pronounced with the widespread 64 | use of generative AI systems, enabling attackers to induce unintended responses that may lead 65 | to data leakage, misinformation, or system disruptions. 66 | """ 67 | 68 | 69 | ATTACK_TYPE_DESCRIPTIONS: dict[AttackType, str] = { 70 | AttackType.ModelEvasion: EVASION_DESCRIPTION, 71 | AttackType.ModelExtraction: EXTRACTION_DESCRIPTION, 72 | AttackType.ModelInversion: INVERSION_DESCRIPTION, 73 | AttackType.ModelPoisoning: POISONING_DESCRIPTION, 74 | AttackType.PromptInjection: PROMPT_INJECTION_DESCRIPTION, 75 | AttackType.Other: "None of the above", 76 | } 77 | 78 | ASSIGN_ATTACK_TYPE_PROMPT = """\ 79 | You will be provided with an abstract of a scientific paper. \ 80 | Assess the most applicable attack type label based on the \ 81 | research focus, produced materials, and key outcomes. 82 | 83 | {types} 84 | 85 | If you feel like none of the types apply, you can respond with "Other". 86 | 87 | Respond with ONLY ONE of the labels above. Do not include anything else in your response. 
88 | """
89 | 
95 | 
96 | def get_openai_client(token: str) -> OpenAIClient:
97 |     return OpenAI(api_key=token)
98 | 
99 | 
100 | def summarize_abstract_with_openai(client: OpenAIClient, abstract: str) -> str:
101 |     response = client.chat.completions.create(
102 |         model="gpt-4o-mini",
103 |         messages=[
104 |             {"role": "system", "content": SUMMARIZE_ABSTRACT_PROMPT},
105 |             {"role": "user", "content": abstract},
106 |         ],
107 |         temperature=0.5,
108 |         max_tokens=100,
109 |     )
110 | 
111 |     return response.choices[0].message.content.strip()  # type: ignore
112 | 
113 | 
114 | def get_focus_label_from_abstract(client: OpenAIClient, abstract: str) -> Focus | None:
115 |     system_prompt = ASSIGN_LABEL_PROMPT.format(
116 |         labels="\n".join([f"- {f.value}" for f in Focus])
117 |     )
118 | 
119 |     response = client.chat.completions.create(
120 |         model="gpt-3.5-turbo",
121 |         messages=[
122 |             {"role": "system", "content": system_prompt},
123 |             {"role": "user", "content": abstract},
124 |         ],
125 |         temperature=0.5,
126 |         max_tokens=10,
127 |     )
128 | 
129 |     content = response.choices[0].message.content.strip()  # type: ignore
130 |     if content not in [f.value for f in Focus]:
131 |         return None
132 | 
133 |     return Focus(content)
134 | 
135 | def get_attack_type_from_abstract(client: OpenAIClient, abstract: str) -> AttackType | None:
136 |     system_prompt = ASSIGN_ATTACK_TYPE_PROMPT.format(
137 |         types="\n".join([f"- `{t.value}`: {ATTACK_TYPE_DESCRIPTIONS[t]}" for t in AttackType])
138 |     )
139 | 
140 |     response = client.chat.completions.create(
141 |         model="gpt-3.5-turbo",
142 |         messages=[
143 |             {"role": "system", "content": system_prompt},
144 |             {"role": "user", "content": abstract},
145 |         ],
146 |         temperature=0.5,
147 |         max_tokens=10,
148 |     )
149 | 
150 |     content = response.choices[0].message.content.strip()  # type: ignore
151 |     content = content.strip("`")
152 | 
153 |     if content not in [t.value for t in AttackType]:
154 |         print(f"Invalid attack type: {content}")
155 |         return None
156 | 
157 |     return AttackType(content)

--------------------------------------------------------------------------------
/notion_utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import typing as t
3 | from datetime import datetime
4 | 
5 | from notion_client import AsyncClient
6 | from notion_client.errors import RequestTimeoutError, APIResponseError
7 | from notion_client.helpers import async_collect_paginated_api
8 | from tqdm import tqdm  # type: ignore
9 | 
10 | from _types import AttackType, Focus, Paper
11 | 
12 | # Retry constants
13 | MAX_RETRIES = 5
14 | RETRY_DELAY = 5
15 | MAX_BATCH_SIZE = 5
16 | 
17 | NotionClient = AsyncClient
18 | 
19 | 
20 | def get_notion_client(token: str) -> NotionClient:
21 |     return NotionClient(auth=token, timeout_ms=60000)  # 60-second timeout
22 | 
23 | 
24 | async def get_papers_from_notion(client: NotionClient, database_id: str, *, max_results: int | None = None) -> list[Paper]:
25 |     retries = 0
26 |     results = []
27 | 
28 |     while retries < MAX_RETRIES:
29 |         try:
30 |             if max_results:
31 |                 response = await client.databases.query(database_id=database_id, page_size=max_results)
32 |                 results = response["results"]
33 |             else:
34 |                 results = await async_collect_paginated_api(
35 |                     client.databases.query, database_id=database_id
36 |                 )
37 |             break
38 |         except (RequestTimeoutError, APIResponseError) as e:
39 |             retries += 1
40 |             if retries >= MAX_RETRIES:
41 |                 print(f"Failed to get papers from Notion after {MAX_RETRIES} attempts: {e}")
42 |                 return []
43 |             else:
44 |                 print(f"Notion API error when fetching papers, retrying ({retries}/{MAX_RETRIES}): {e}")
45 |                 # Exponential backoff with jitter
46 |                 wait_time = RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries)
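                # e.g. with RETRY_DELAY = 5 the waits are ~5.5s, 11.0s, 21.5s, 42.0s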
47 |                 print(f"Waiting {wait_time:.1f} seconds before retry...")
48 |                 await asyncio.sleep(wait_time)
49 | 
50 |     papers: list[Paper] = []
51 |     for result in results:
52 |         page_id = result["id"]
53 |         properties = result["properties"]
54 | 
55 |         title = properties["Title"]["title"]
56 |         title = title[0]["text"]["content"] if title else None
57 |         url = properties["URL"]["url"]
58 |         summary = properties["Summary"]["rich_text"]
59 |         summary = summary[0]["text"]["content"] if summary else None
60 |         authors = [author["name"] for author in properties["Authors"]["multi_select"]]
61 |         published = properties["Published"]["date"]
62 |         published = datetime.fromisoformat(published["start"]) if published else None
63 |         focus = properties["Focus"]["select"]
64 |         focus = Focus(focus["name"]) if focus else None
65 |         attack_type = properties["Attack Type"]["select"]
66 |         attack_type = AttackType(attack_type["name"]) if attack_type else None
67 |         explored = properties["Explored"]["checkbox"]
68 | 
69 |         if not (url or title):
70 |             continue
71 | 
72 |         papers.append(
73 |             Paper(
74 |                 page_id=page_id,
75 |                 title=title,
76 |                 url=url,
77 |                 focus=focus,
78 |                 attack_type=attack_type,
79 |                 summary=summary,
80 |                 authors=authors,
81 |                 published=published,
82 |                 explored=explored,
83 |                 track_changes=True,
84 |             )
85 |         )
86 | 
87 |     return papers
88 | 
89 | 
90 | async def write_papers_to_notion(
91 |     client: NotionClient, database_id: str, papers: list[Paper]
92 | ) -> None:
93 |     # Process papers in smaller batches with pauses between
94 |     for i in range(0, len(papers), MAX_BATCH_SIZE):
95 |         batch = papers[i:i+MAX_BATCH_SIZE]
96 |         print(f"Processing batch {i//MAX_BATCH_SIZE + 1}/{(len(papers) + MAX_BATCH_SIZE - 1)//MAX_BATCH_SIZE}")
97 | 
98 |         for paper in tqdm(batch):
99 |             properties: dict[str, t.Any] = {}
100 |             if paper.title:
101 |                 properties["Title"] = {"title": [{"text": {"content": paper.title}}]}
102 |             if paper.url:
103 |                 properties["URL"] = {"url": paper.url}
104 |             if paper.summary:
105 |                 properties["Summary"] = {
106 |                     "rich_text": [{"text": {"content": paper.summary}}]
107 |                 }
108 |             if paper.authors:
109 |                 properties["Authors"] = {
110 |                     "multi_select": [{"name": author} for author in paper.authors]
111 |                 }
112 |             if paper.published:
113 |                 properties["Published"] = {"date": {"start": paper.published.isoformat()}}
114 |             if paper.focus:
115 |                 properties["Focus"] = {"select": {"name": paper.focus.value}}
116 |             if paper.attack_type:
117 |                 properties["Attack Type"] = {"select": {"name": paper.attack_type.value}}
118 |             if paper.explored is not None:
119 |                 properties["Explored"] = {"checkbox": paper.explored}
120 | 
121 |             # Retry logic with progressive backoff
122 |             retries = 0
123 |             while retries < MAX_RETRIES:
124 |                 try:
125 |                     if paper.page_id:
126 |                         await client.pages.update(paper.page_id, properties=properties)
127 |                     else:
128 |                         await client.pages.create(
129 |                             parent={"database_id": database_id}, properties=properties
130 |                         )
131 |                     # Success, break out of retry loop
132 |                     break
133 |                 except (RequestTimeoutError, APIResponseError) as e:
134 |                     retries += 1
135 |                     if retries >= MAX_RETRIES:
136 |                         print(f"Failed to update/create paper after {MAX_RETRIES} attempts: {str(paper.title)[:50]}...")
137 |                         # Don't raise - continue with other papers
138 |                         break
139 |                     else:
140 |                         print(f"Notion API error, retrying ({retries}/{MAX_RETRIES}): {e}")
141 |                         # Exponential backoff with jitter (same schedule as above)
142 |                         wait_time = RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries)
143 |                         print(f"Waiting {wait_time:.1f} seconds before retry...")
144 |                         await asyncio.sleep(wait_time)
145 | 
146 |             # Add a small delay between papers regardless of success/failure
147 |             await asyncio.sleep(1)
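        # The per-paper sleep and small batches keep request volume under
        # Notion's documented rate limit (an average of ~3 requests per second).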
148 | 
149 |         if i + MAX_BATCH_SIZE < len(papers):
150 |             print("Pausing for 10 seconds between batches...")
151 |             await asyncio.sleep(10)
152 | 

--------------------------------------------------------------------------------