├── static
│   └── uploads
│       └── .gitkeep
├── assets
│   └── media
│       ├── icons
│       │   └── .gitkeep
│       ├── icon.png
│       └── covers
│           ├── breslin-silicon.webp
│           ├── nasa-Q1p7bh3SHj8-unsplash.jpg
│           └── pexels-nuno-fangueiro-12125258.jpg
├── .github
│   └── FUNDING.yml
├── preview.webp
├── .gitignore
├── config
│   └── _default
│       ├── menus.yaml
│       ├── params.yaml
│       └── config.yaml
├── go.mod
├── .editorconfig
├── netlify.toml
├── README.md
└── content
    └── _index.md

/static/uploads/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/assets/media/icons/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
github: gcushen
custom: https://wowchemy.com/sponsor/
--------------------------------------------------------------------------------
/preview.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/preview.webp
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# IDEs
.idea/

# Hugo
resources/
public/
jsconfig.json
--------------------------------------------------------------------------------
/assets/media/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/icon.png
--------------------------------------------------------------------------------
/assets/media/covers/breslin-silicon.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/covers/breslin-silicon.webp
--------------------------------------------------------------------------------
/assets/media/covers/nasa-Q1p7bh3SHj8-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/covers/nasa-Q1p7bh3SHj8-unsplash.jpg
--------------------------------------------------------------------------------
/assets/media/covers/pexels-nuno-fangueiro-12125258.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/covers/pexels-nuno-fangueiro-12125258.jpg
--------------------------------------------------------------------------------
/config/_default/menus.yaml:
--------------------------------------------------------------------------------
# Navigation Links
# To link a homepage widget, specify the URL as a hash `#` followed by the filename of the
# desired widget in your `content/home/` folder.
# The weight parameter defines the order in which the links will appear.
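#
# As a purely illustrative sketch (this hypothetical entry is not part of this
# site's config), a link to a widget saved as `content/home/notes.md` would be:
#
#   - name: Notes
#     url: '#notes'
#     weight: 20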

main:
  - name: Home
    url: /
    weight: 10

--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
module github.com/wowchemy/hugo-second-brain-theme

go 1.15

require (
	github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify v1.0.0 // indirect
	github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify-cms v1.0.0 // indirect
	github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy/v5 v5.7.0 // indirect
)
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
# editorconfig.org

root = true

[*]
charset = utf-8
end_of_line = lf
indent_size = 2
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true

[*.toml]
max_line_length = 100

[*.md]
trim_trailing_whitespace = false

[layouts/shortcodes/*.html]
insert_final_newline = false
--------------------------------------------------------------------------------
/netlify.toml:
--------------------------------------------------------------------------------
[build]
command = "hugo --gc --minify -b $URL"
publish = "public"

[build.environment]
HUGO_VERSION = "0.97.3"
HUGO_ENABLEGITINFO = "true"

[context.production.environment]
HUGO_ENV = "production"

[context.deploy-preview]
command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL"

[context.branch-deploy]
command = "hugo --gc --minify -b $DEPLOY_PRIME_URL"

[[plugins]]
package = "netlify-plugin-hugo-cache-resources"
[plugins.inputs]
debug = true
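
# Note on the build commands above: Hugo's `-b`/`--baseURL` flag overrides the
# site's configured baseURL, so deploy previews and branch deploys build against
# the Netlify-provided $DEPLOY_PRIME_URL rather than the production $URL;
# `--buildFuture` additionally renders future-dated content in deploy previews.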
--------------------------------------------------------------------------------
/config/_default/params.yaml:
--------------------------------------------------------------------------------
# SITE SETUP
# Guide: https://wowchemy.com/docs/getting-started/
# Documentation: https://wowchemy.com/docs/
# This file is formatted using YAML syntax - learn more at https://learnxinyminutes.com/docs/yaml/

# Appearance

appearance:
  theme_day: minimal
  theme_night: minimal
  font: minimal
  font_size: L

# SEO

marketing:
  seo:
    site_type: Project
    local_business_type: ''
    org_name: ''
    description: 'The latest research, news, and papers on large language model security.'
    twitter: 'llm_sec'
  analytics:
    google_analytics: ''
    baidu_tongji: ''
    google_tag_manager: ''
    microsoft_clarity: ''
  verification:
    google: ''
    baidu: ''

# Site header

header:
  navbar:
    enable: false
    align: l
    show_logo: true
    show_language: false
    show_day_night: true
    show_search: false
    highlight_active_link: true

footer:
  copyright:
    notice: '© {year} [@llm_sec](https://twitter.com/llm_sec). This work is licensed under {license}'
    license:
      enable: true
      allow_derivatives: true
      share_alike: false
      allow_commercial: true

# Localization

locale:
  date_format: 'Jan 2, 2006'
  time_format: '3:04 PM'

# Site features

features:
  syntax_highlighter:
    theme_light: github-light
    theme_dark: dracula
  math:
    enable: false
  privacy_pack:
    enable: false
  repository:
    url: 'https://github.com/llmsecnet/llmsec-site'
    content_dir: content
    branch: main
  comment:
    provider: ''
    disqus:
      shortname: ''
      show_count: true
    commento:
      url: ''
    giscus:
      repo: ''
      repo_id: ''
      category: ''
      category_id: ''
--------------------------------------------------------------------------------
/config/_default/config.yaml:
--------------------------------------------------------------------------------
# Configuration of Hugo
# Guide: https://wowchemy.com/docs/getting-started/
# Hugo Documentation: https://gohugo.io/getting-started/configuration/#all-configuration-settings
# This file is formatted using YAML syntax - learn more at https://learnxinyminutes.com/docs/yaml/

title: 'LLM Security' # Website name
baseURL: 'https://llmsec.net/' # Website URL

############################
## PAGE OPTIONS
############################

cascade:
  # Docs folder options
  - _target:
      path: /**
    type: book
    editable: true
    show_breadcrumb: true

############################
## LANGUAGE
############################

languageCode: en-us
hasCJKLanguage: false
defaultContentLanguageInSubdir: false
removePathAccents: true

############################
## MODULES
############################

module:
  imports:
    - path: github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify-cms
      disable: true
    - path: github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify
    - path: github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy/v5

############################
## ADVANCED
############################

enableGitInfo: false
summaryLength: 30
paginate: 10
enableEmoji: true
enableRobotsTXT: true
footnotereturnlinkcontents: ^
ignoreFiles: [\.ipynb$, .ipynb_checkpoints$, \.Rmd$, \.Rmarkdown$, _cache$]
permalinks:
  authors: '/author/:slug/'
  tags: '/tag/:slug/'
  categories: '/category/:slug/'
disableAliases: true
outputs:
  home: [HTML, RSS, JSON, WebAppManifest, headers, redirects]
  section: [HTML, RSS]
imaging:
  resampleFilter: lanczos
  quality: 97
  anchor: smart
timeout: 600000
taxonomies:
  tag: tags
  category: categories
  author: authors
markup:
  _merge: deep
related:
  threshold: 80
  includeNewer: true
  toLower: true
  indices:
    - name: tags
      weight: 100
    - name: categories
      weight: 70
security:
  _merge: deep
sitemap:
  _merge: deep
minify:
  _merge: deep
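
# Note: keys marked `_merge: deep` above (markup, security, sitemap, minify)
# are deep-merged with the corresponding defaults supplied by the imported
# theme modules, rather than one side replacing the other outright.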
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# [Hugo Second Brain Theme](https://github.com/wowchemy/hugo-second-brain-theme)

[![Screenshot](./preview.webp)](https://wowchemy.com/hugo-themes/)

The **Second Brain** starter template empowers you to easily create **personal notes** and **knowledge bases** in a future-proof way.

- It is your **second brain** 🧠, stored in **future-proof** Markdown files
- Supports audio, video, images, math, code, [Mermaid](https://mermaid.live/) diagrams, and [much more](https://wowchemy.com/docs/content/writing-markdown-latex/)
- Edit your notes online on GitHub, or in any Git-connected Markdown app such as [Obsidian](https://obsidian.md/) or [Visual Studio Code](https://vscode.dev/)

[![Get Started](https://img.shields.io/badge/-Get%20started-ff4655?style=for-the-badge)](https://wowchemy.com/hugo-themes/)
[![Discord](https://img.shields.io/discord/722225264733716590?style=for-the-badge)](https://discord.com/channels/722225264733716590/742892432458252370/742895548159492138)
[![Twitter Follow](https://img.shields.io/twitter/follow/wowchemy?label=Follow%20on%20Twitter)](https://twitter.com/wowchemy)

[Check out the latest demo](https://wowchemy.com/docs/) of what you'll get in less than 10 minutes, or [get inspired by how others are using this template](https://wowchemy.com/creators/).

The integrated [**Wowchemy**](https://wowchemy.com) website builder and CMS makes it easy to create a beautiful website for free. Edit your site in the CMS (or your favorite editor), generate it with [Hugo](https://github.com/gohugoio/hugo), and deploy with GitHub or Netlify. Customize anything on your site with widgets, light/dark themes, and language packs.

- 👉 [**Get Started**](https://wowchemy.com/hugo-themes/)
- 📚 [View the **documentation**](https://wowchemy.com/docs/)
- 💬 [Chat with the **Wowchemy research community**](https://discord.gg/z8wNYzb) or the [**Hugo community**](https://discourse.gohugo.io)
- ⬇️ **Automatically import citations from BibTeX** with the [Hugo Academic CLI](https://github.com/wowchemy/hugo-academic-cli)
- 🐦 Share your new site with the community: [@wowchemy](https://twitter.com/wowchemy) [@GeorgeCushen](https://twitter.com/GeorgeCushen) [#MadeWithWowchemy](https://twitter.com/search?q=%23MadeWithWowchemy&src=typed_query)
- 🗳 [Take the survey and help us improve #OpenSource](https://forms.gle/NioD9VhUg7PNmdCAA)
- 🚀 [Contribute improvements](https://github.com/wowchemy/wowchemy-hugo-themes/blob/main/CONTRIBUTING.md) or [suggest ideas](https://github.com/wowchemy/wowchemy-hugo-themes/issues)
- ⬆️ **Updating?** View the [Update Guide](https://wowchemy.com/docs/hugo-tutorials/update/) and [Release Notes](https://github.com/wowchemy/wowchemy-hugo-themes/releases)

## We ask you, humbly, to support this open source movement

Today we ask you to defend the open source independence of the Wowchemy website builder and themes 🐧

We're an open source movement that depends on your support to stay online and thriving, but 99.9% of our creators don't give; they simply look the other way.

### [❤️ Click here to become a GitHub Sponsor, unlocking awesome perks such as _exclusive academic templates and widgets_](https://github.com/sponsors/gcushen)

--------------------------------------------------------------------------------
/content/_index.md:
--------------------------------------------------------------------------------
---
title: LLM Security
toc: false

image:
  filename: covers/pexels-nuno-fangueiro-12125258.jpg
  caption: Monstera - Nuno Fangueiro
---

LLM security is the investigation of the failure modes of LLMs in use, the conditions that lead to them, and their mitigations.

Here are links to large language model security content - research, papers, and news - posted by [@llm_sec](https://twitter.com/llm_sec).

Got a tip/link? Open a [pull request](https://github.com/llmsecnet/llmsec-site) or send a [DM](https://twitter.com/llm_sec).

## Getting Started

* [How to hack Google Bard, ChatGPT, or any other chatbot](https://dataconomy.com/2023/09/01/how-to-hack-google-bard-chatbots/)
* [Prompt injection primer for engineers](https://github.com/jthack/PIPE)
* [Tutorial based on the OWASP LLM Top 10 vulnerabilities, by Hego](https://wiki.hego.tech/owasp/owasp-llm-top-10-v1.0)

## Attacks

### Adversarial

* [A LLM Assisted Exploitation of AI-Guardian](https://arxiv.org/abs/2307.15008)
* [Adversarial Attacks on Tables with Entity Swap](https://ceur-ws.org/Vol-3462/TADA4.pdf)
* [Adversarial Demonstration Attacks on Large Language Models](https://arxiv.org/abs/2305.14950)
* [Adversarial Examples Are Not Bugs, They Are Features](https://arxiv.org/abs/1905.02175) 🌶️
* [Are Aligned Language Models “Adversarially Aligned”?](https://www.youtube.com/watch?v=uqOfC3KSZFc) 🌶️
* [Bad Characters: Imperceptible NLP Attacks](https://arxiv.org/abs/2106.09898)
* [Breaking BERT: Understanding its Vulnerabilities for Named Entity Recognition through Adversarial Attack](https://arxiv.org/abs/2109.11308)
* [Expanding Scope: Adapting English Adversarial Attacks to Chinese](https://aclanthology.org/2023.trustnlp-1.24/)
* [Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To!](https://arxiv.org/abs/2310.03693)
* [Gradient-based Adversarial Attacks against Text Transformers](https://arxiv.org/abs/2104.13733)
* [Gradient-Based Word Substitution for Obstinate Adversarial Examples Generation in Language Models](https://arxiv.org/abs/2307.12507)
* [Sample Attackability in Natural Language Adversarial Attacks](https://aclanthology.org/2023.trustnlp-1.9/)
* [Universal and Transferable Adversarial Attacks on Aligned Language Models](https://arxiv.org/abs/2307.15043)
* [Why Should Adversarial Perturbations be Imperceptible? Rethink the Research Paradigm in Adversarial NLP](https://arxiv.org/abs/2210.10683) 🌶️

### Backdoors & data poisoning

* [A backdoor attack against LSTM-based text classification systems](https://arxiv.org/abs/1905.12457) "Submitted on 29 May 2019"!
* [A Gradient Control Method for Backdoor Attacks on Parameter-Efficient Tuning](https://aclanthology.org/2023.acl-long.194/)
* [Are You Copying My Model? Protecting the Copyright of Large Language Models for EaaS via Backdoor Watermark](https://arxiv.org/abs/2305.10036)
* [Backdoor Learning on Sequence to Sequence Models](https://arxiv.org/abs/2305.02424)
* [Backdooring Neural Code Search](https://arxiv.org/abs/2305.17506) 🌶️
* [BadPre: Task-agnostic Backdoor Attacks to Pre-trained NLP Foundation Models](https://arxiv.org/abs/2110.02467)
* [BadPrompt: Backdoor Attacks on Continuous Prompts](https://arxiv.org/abs/2211.14719)
* [Be Careful about Poisoned Word Embeddings: Exploring the Vulnerability of the Embedding Layers in NLP Models](https://arxiv.org/abs/2103.15543)
* [BadNL: Backdoor Attacks against NLP Models with Semantic-preserving Improvements](https://arxiv.org/abs/2006.01043)
* [BITE: Textual Backdoor Attacks with Iterative Trigger Injection](https://arxiv.org/abs/2205.12700) 🌶️
* [Exploring the Universal Vulnerability of Prompt-based Learning Paradigm](https://aclanthology.org/2022.findings-naacl.137/)
* [Hidden Killer: Invisible Textual Backdoor Attacks with Syntactic Trigger](https://arxiv.org/abs/2105.12400) 🌶️
* [Instructions as Backdoors: Backdoor Vulnerabilities of Instruction Tuning for Large Language Models](https://arxiv.org/abs/2305.14710)
* [Mind the Style of Text! Adversarial and Backdoor Attacks Based on Text Style Transfer](https://aclanthology.org/2021.emnlp-main.374/)
* [On the Exploitability of Instruction Tuning](https://arxiv.org/abs/2306.17194)
* [Poisoning Web-Scale Training Datasets is Practical](https://arxiv.org/abs/2302.10149) 🌶️
* [Prompt as Triggers for Backdoor Attack: Examining the Vulnerability in Language Models](https://arxiv.org/abs/2305.01219)
* [Textual Backdoor Attacks Can Be More Harmful via Two Simple Tricks](https://arxiv.org/abs/2110.08247)
* [Two-in-One: A Model Hijacking Attack Against Text Generation Models](https://arxiv.org/abs/2305.07406)

### Prompt injection

* [Bing Chat: Data Exfiltration Exploit Explained](https://embracethered.com/blog/posts/2023/bing-chat-data-exfiltration-poc-and-fix/) 🌶️
* [ChatGPT's new browser feature is affected by Indirect Prompt Injection vulnerability](https://twitter.com/evrnyalcin/status/1707298475216425400)
* [Compromising LLMs: The Advent of AI Malware](https://www.blackhat.com/us-23/briefings/schedule/index.html#compromising-llms-the-advent-of-ai-malware-33075)
* [Generative AI’s Biggest Security Flaw Is Not Easy to Fix](https://www.wired.com/story/generative-ai-prompt-injection-hacking/)
* [GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher](https://arxiv.org/abs/2308.06463)
* [Hackers Compromised ChatGPT Model with Indirect Prompt Injection](https://gbhackers.com/hackers-compromised-chatgpt-model/)
* [Large Language Model Prompts for Prompt Injection (RTC0006)](https://redteamrecipe.com/Large-Language-Model-Prompts/)
* [Ignore Previous Prompt: Attack Techniques For Language Models](https://arxiv.org/abs/2211.09527) 🌶️
* [Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection](https://arxiv.org/abs/2302.12173) 🌶️
* [Prompt Injection attack against LLM-integrated Applications](https://arxiv.org/abs/2306.05499)
* [Safeguarding Crowdsourcing Surveys from ChatGPT with Prompt Injection](https://arxiv.org/abs/2306.08833)
* [Virtual Prompt Injection for Instruction-Tuned Large Language Models](https://arxiv.org/abs/2307.16888)

### Jailbreaking

* [AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models](https://arxiv.org/abs/2310.04451) 🌶️
* ["Do Anything Now": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models](https://arxiv.org/abs/2308.03825) 🌶️
* [GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts](https://arxiv.org/abs/2309.10253)
* [JAILBREAKER: Automated Jailbreak Across Multiple Large Language Model Chatbots](https://arxiv.org/pdf/2307.08715.pdf)
* [Jailbroken: How Does LLM Safety Training Fail?](https://arxiv.org/abs/2307.02483)
* [LLM Censorship: A Machine Learning Challenge Or A Computer Security Problem?](https://www.cl.cam.ac.uk/~is410/Papers/llm_censorship.pdf) (mosaic prompts)
* [Low-Resource Languages Jailbreak GPT-4](https://arxiv.org/abs/2310.02446) 🌶️
* [Self-Deception: Reverse Penetrating the Semantic Firewall of Large Language Models](https://arxiv.org/abs/2308.11521v1)

### Data extraction & privacy

* [DP-Forward: Fine-tuning and Inference on Language Models with Differential Privacy in Forward Pass](https://arxiv.org/abs/2309.06746)
* [Extracting Training Data from Large Language Models](https://arxiv.org/abs/2012.07805)
* [Privacy Side Channels in Machine Learning Systems](https://arxiv.org/abs/2309.05610) 🌶️
* [Prompts Should not be Seen as Secrets: Systematically Measuring Prompt Extraction Attack Success](https://arxiv.org/abs/2307.06865)
* [ProPILE: Probing Privacy Leakage in Large Language Models](https://arxiv.org/abs/2307.01881) 🌶️
* [Training Data Extraction From Pre-trained Language Models: A Survey](https://aclanthology.org/2023.trustnlp-1.23/)

### Data reconstruction

* [Deconstructing Classifiers: Towards A Data Reconstruction Attack Against Text Classification Models](https://arxiv.org/abs/2306.13789)

### Denial of service

* [Sponge Examples: Energy-Latency Attacks on Neural Networks](https://arxiv.org/abs/2006.03463) 🌶️

### Escalation

* [Demystifying RCE Vulnerabilities in LLM-Integrated Apps](https://arxiv.org/abs/2309.02926) 🌶️
* [Hacking Auto-GPT and escaping its docker container](https://positive.security/blog/auto-gpt-rce)

### Evasion

* [Large Language Models can be Guided to Evade AI-Generated Text Detection](https://arxiv.org/abs/2305.10847)
* [GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher](https://arxiv.org/abs/2308.06463)

### Malicious code

* [A Study on Robustness and Reliability of Large Language Model Code Generation](https://arxiv.org/abs/2308.10335)
* [Can you trust ChatGPT’s package recommendations?](https://vulcan.io/blog/ai-hallucinations-package-risk)

### XSS/CSRF/CPRF

* [LLM causing self-XSS](https://hackstery.com/2023/07/10/llm-causing-self-xss/)

### Cross-model

* [Exploring the Vulnerability of Natural Language Processing Models via Universal Adversarial Texts](https://aclanthology.org/2021.alta-1.14/)

### Multimodal

* [(Ab)using Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs](https://arxiv.org/abs/2307.10490)
* [Image to Prompt Injection with Google Bard](https://embracethered.com/blog/posts/2023/google-bard-image-to-prompt-injection/)
* [Plug and Pray: Exploiting off-the-shelf components of Multi-Modal Models](https://arxiv.org/abs/2307.14539)
* [Visual Adversarial Examples Jailbreak Aligned Large Language Models](https://arxiv.org/abs/2306.13213)

### Model theft

* [Stealing Machine Learning Models via Prediction APIs](https://arxiv.org/abs/1609.02943)

### Attack automation

* [FakeToxicityPrompts: Automatic Red Teaming](https://interhumanagreement.substack.com/p/faketoxicityprompts-automatic-red)
* [FLIRT: Feedback Loop In-context Red Teaming](https://huggingface.co/papers/2308.04265)
* [Red Teaming Language Models with Language Models](https://arxiv.org/abs/2202.03286)
* [Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned](https://arxiv.org/abs/2209.07858)
* [Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment](https://arxiv.org/abs/2308.09662)

## Defenses & Detections

### against things other than backdoors

* [Baseline Defenses for Adversarial Attacks Against Aligned Language Models](https://arxiv.org/abs/2309.00614)
* [Defending ChatGPT against Jailbreak Attack via Self-Reminder](https://assets.researchsquare.com/files/rs-2873090/v1_covered_3dc9af48-92ba-491e-924d-b13ba9b7216f.pdf?c=1686882819)
* [Diffusion Theory as a Scalpel: Detecting and Purifying Poisonous Dimensions in Pre-trained Language Models Caused by Backdoor or Bias](https://arxiv.org/abs/2305.04547)
* [Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://proceedings.neurips.cc/paper_files/paper/2022/hash/e8c20cafe841cba3e31a17488dc9c3f1-Abstract-Conference.html)
* [FedMLSecurity: A Benchmark for Attacks and Defenses in Federated Learning and LLMs](https://arxiv.org/abs/2306.04959)
* [Interpretability and Transparency-Driven Detection and Transformation of Textual Adversarial Examples (IT-DT)](https://arxiv.org/abs/2307.01225)
* [Large Language Models for Code: Security Hardening and Adversarial Testing](https://www.sri.inf.ethz.ch/publications/ccs23-llmsec)
* [LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked](https://arxiv.org/abs/2308.07308)
* [Make Text Unlearnable: Exploiting Effective Patterns to Protect Personal Data](https://aclanthology.org/2023.trustnlp-1.22/)
* [Mitigating Stored Prompt Injection Attacks Against LLM Applications](https://developer.nvidia.com/blog/mitigating-stored-prompt-injection-attacks-against-llm-applications/)
* [RAIN: Your Language Models Can Align Themselves without Finetuning](https://arxiv.org/abs/2309.07124) 🌶️
* [Secure your machine learning with Semgrep](https://blog.trailofbits.com/2022/10/03/semgrep-maching-learning-static-analysis/)
* [Sparse Logits Suffice to Fail Knowledge Distillation](https://openreview.net/forum?id=BxZgduuNDl5)
* [Text-CRS: A Generalized Certified Robustness Framework against Textual Adversarial Attacks](https://arxiv.org/abs/2307.16630)
* [Thinking about the security of AI systems](https://www.ncsc.gov.uk/blog-post/thinking-about-security-ai-systems)
* [Towards building a robust toxicity predictor](https://www.amazon.science/publications/towards-building-a-robust-toxicity-predictor)

### against backdoors / backdoor insertion

* [Defending against Insertion-based Textual Backdoor Attacks via Attribution](https://aclanthology.org/2023.findings-acl.561/)
* [Donkii: Can Annotation Error Detection Methods Find Errors in Instruction-Tuning Datasets?](https://arxiv.org/abs/2309.01669)
* [Exploring the Universal Vulnerability of Prompt-based Learning Paradigm](https://aclanthology.org/2022.findings-naacl.137/)
* [GPTs Don’t Keep Secrets: Searching for Backdoor Watermark Triggers in Autoregressive Language Models](https://aclanthology.org/2023.trustnlp-1.21/) 🌶️
* [IMBERT: Making BERT Immune to Insertion-based Backdoor Attacks](https://aclanthology.org/2023.trustnlp-1.25/) 🌶️
* [Maximum Entropy Loss, the Silver Bullet Targeting Backdoor Attacks in Pre-trained Language Models](https://aclanthology.org/2023.findings-acl.237/)
* [ONION: A Simple and Effective Defense Against Textual Backdoor Attacks](https://arxiv.org/abs/2011.10369)
* [ParaFuzz: An Interpretability-Driven Technique for Detecting Poisoned Samples in NLP](https://arxiv.org/abs/2308.02122) 🌶️
* [VDC: Versatile Data Cleanser for Detecting Dirty Samples via Visual-Linguistic Inconsistency](https://arxiv.org/abs/2309.16211)

## Evaluation

* [Do you really follow me? Adversarial Instructions for Evaluating the Robustness of Large Language Models](https://arxiv.org/abs/2308.10819)
* [Evaluating the Susceptibility of Pre-Trained Language Models via Handcrafted Adversarial Examples](https://arxiv.org/abs/2209.02128)
* [Latent Jailbreak: A Test Suite for Evaluating Both Text Safety and Output Robustness of Large Language Models](https://arxiv.org/abs/2307.08487) 🌶️
* [LLM-Deliberation: Evaluating LLMs with Interactive Multi-Agent Negotiation Games](https://arxiv.org/abs/2309.17234)
* [LLM Platform Security: Applying a Systematic Evaluation Framework to OpenAI's ChatGPT Plugins](https://arxiv.org/abs/2309.10254)
* [PromptBench: Towards Evaluating the Robustness of Large Language Models on Adversarial Prompts](https://arxiv.org/abs/2306.04528)
* [TrustGPT: A Benchmark for Trustworthy and Responsible Large Language Models](https://arxiv.org/abs/2306.11507)

## Practices

* [A framework to securely use LLMs in companies - Part 1: Overview of Risks](https://boringappsec.substack.com/p/edition-21-a-framework-to-securely)
* [All the Hard Stuff Nobody Talks About when Building Products with LLMs](https://www.honeycomb.io/blog/hard-stuff-nobody-talks-about-llm)
* [Artificial intelligence and machine learning security](https://learn.microsoft.com/en-us/security/engineering/failure-modes-in-machine-learning) (Microsoft) 🌶️
* [Assessing Language Model Deployment with Risk Cards](https://arxiv.org/abs/2303.18190)
* [Explore, Establish, Exploit: Red Teaming Language Models from Scratch](https://arxiv.org/abs/2306.09442)
* [Protect Your Prompts: Protocols for IP Protection in LLM Applications](https://arxiv.org/abs/2306.06297)
* ["Real Attackers Don't Compute Gradients": Bridging the Gap Between Adversarial ML Research and Practice](https://arxiv.org/abs/2212.14315) 🌶️
* [Red Teaming Handbook](https://assets.publishing.service.gov.uk/media/61702155e90e07197867eb93/20210625-Red_Teaming_Handbook.pdf) 🌶️
* [Securing LLM Systems Against Prompt Injection](https://developer.nvidia.com/blog/securing-llm-systems-against-prompt-injection/)
* [Threat Modeling LLM Applications](https://aivillage.org/large%20language%20models/threat-modeling-llm/)
* [Toward Comprehensive Risk Assessments and Assurance of AI-Based Systems](https://docs.google.com/viewer?url=https://raw.githubusercontent.com/trailofbits/publications/master/papers/toward_comprehensive_risk_assessments.pdf)
* [Understanding the risks of deploying LLMs in your enterprise](https://www.moveworks.com/insights/risks-of-deploying-llms-in-your-enterprise)

## Analyses & surveys

* [A Comprehensive Overview of Backdoor Attacks in Large Language Models within Communication Networks](https://arxiv.org/abs/2308.14367)
* [Chatbots to ChatGPT in a Cybersecurity Space: Evolution, Vulnerabilities, Attacks, Challenges, and Future Recommendations](https://arxiv.org/abs/2306.09255)
* [Identifying and Mitigating the Security Risks of Generative AI](https://arxiv.org/abs/2308.14840)
* [OWASP Top 10 for LLM vulnerabilities](https://llmtop10.com/) 🌶️
* [Security and Privacy on Generative Data in AIGC: A Survey](https://arxiv.org/abs/2309.09435)
* [The AI Attack Surface Map v1.0](https://danielmiessler.com/p/the-ai-attack-surface-map-v1-0/)
* [Towards Security Threats of Deep Learning Systems: A Survey](https://arxiv.org/abs/1911.12562)

## Policy, legal, ethical, and social

* [Are You Worthy of My Trust?: A Socioethical Perspective on the Impacts of Trustworthy AI Systems on the Environment and Human Society](https://arxiv.org/abs/2309.09450)
* [Cybercrime and Privacy Threats of Large Language Models](https://ieeexplore.ieee.org/abstract/document/10174273)
* [Ethical Considerations and Policy Implications for Large Language Models: Guiding Responsible Development and Deployment](https://arxiv.org/abs/2308.02678)
* [Frontier AI Regulation: Managing Emerging Risks to Public Safety](https://arxiv.org/abs/2307.03718)
* [Loose-lipped large language models spill your secrets: The privacy implications of large language models](https://jolt.law.harvard.edu/assets/articlePDFs/v36/Winograd-Loose-Lipped-LLMs.pdf)
* [On the Trustworthiness Landscape of State-of-the-art Generative Models: A Comprehensive Survey](https://arxiv.org/abs/2307.16680)
* [On the Dangers of Stochastic Parrots: Can Language Models Be Too Big? 🦜](https://dl.acm.org/doi/10.1145/3442188.3445922) 🌶️
* [Product Liability for Defective AI](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4515202)
* [The last attempted AI revolution in security, and the next one](https://drive.google.com/file/d/1BbSIBayQ1RHVSnh-FnaeXr8xjw5SVJV8/view?pli=1)
* [Unveiling Security, Privacy, and Ethical Concerns of ChatGPT](https://arxiv.org/abs/2307.14192)
* [Where's the Liability in Harmful AI Speech?](https://arxiv.org/abs/2308.04635)

## Software

### LLM-specific

* [BITE](https://github.com/INK-USC/BITE) Textual Backdoor Attacks with Iterative Trigger Injection
* [garak](https://github.com/leondz/garak/) LLM vulnerability scanner 🌶️🌶️
* [HouYi](https://github.com/LLMSecurity/HouYi) successful prompt injection framework 🌶️
* [dropbox/llm-security](https://github.com/dropbox/llm-security) demo scripts & docs for LLM attacks
* [promptmap](https://github.com/utkusen/promptmap) bulk testing of prompt injection on OpenAI LLMs
* [rebuff](https://github.com/protectai/rebuff) LLM Prompt Injection Detector
* [vigil-llm](https://github.com/deadbits/vigil-llm) risky LLM input detection

### general MLsec

* [Adversarial Robustness Toolbox](https://github.com/Trusted-AI/adversarial-robustness-toolbox)
* [nvtrust](https://github.com/NVIDIA/nvtrust) Ancillary open source software to support confidential computing on NVIDIA GPUs

🌶️ = extra spicy
--------------------------------------------------------------------------------