├── static
│   └── uploads
│       └── .gitkeep
├── assets
│   └── media
│       ├── icons
│       │   └── .gitkeep
│       ├── icon.png
│       └── covers
│           ├── breslin-silicon.webp
│           ├── nasa-Q1p7bh3SHj8-unsplash.jpg
│           └── pexels-nuno-fangueiro-12125258.jpg
├── .github
│   └── FUNDING.yml
├── preview.webp
├── .gitignore
├── config
│   └── _default
│       ├── menus.yaml
│       ├── params.yaml
│       └── config.yaml
├── go.mod
├── .editorconfig
├── netlify.toml
├── README.md
└── content
    └── _index.md

/static/uploads/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/assets/media/icons/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
github: gcushen
custom: https://wowchemy.com/sponsor/
--------------------------------------------------------------------------------
/preview.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/preview.webp
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# IDEs
.idea/

# Hugo
resources/
public/
jsconfig.json
--------------------------------------------------------------------------------
/assets/media/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/icon.png
--------------------------------------------------------------------------------
/assets/media/covers/breslin-silicon.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/covers/breslin-silicon.webp
--------------------------------------------------------------------------------
/assets/media/covers/nasa-Q1p7bh3SHj8-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/covers/nasa-Q1p7bh3SHj8-unsplash.jpg
--------------------------------------------------------------------------------
/assets/media/covers/pexels-nuno-fangueiro-12125258.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llmsecnet/llmsec-site/HEAD/assets/media/covers/pexels-nuno-fangueiro-12125258.jpg
--------------------------------------------------------------------------------
/config/_default/menus.yaml:
--------------------------------------------------------------------------------
# Navigation Links
# To link a homepage widget, specify the URL as a hash `#` followed by the filename of the
# desired widget in your `content/home/` folder.
# The weight parameter defines the order in which the links will appear.
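#
# As a purely illustrative sketch (this hypothetical entry is not part of this
# site's config), a link to a widget saved as `content/home/notes.md` would be:
#
#   - name: Notes
#     url: '#notes'
#     weight: 20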

main:
  - name: Home
    url: /
    weight: 10

--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
module github.com/wowchemy/hugo-second-brain-theme

go 1.15

require (
	github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify v1.0.0 // indirect
	github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify-cms v1.0.0 // indirect
	github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy/v5 v5.7.0 // indirect
)
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
# editorconfig.org

root = true

[*]
charset = utf-8
end_of_line = lf
indent_size = 2
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true

[*.toml]
max_line_length = 100

[*.md]
trim_trailing_whitespace = false

[layouts/shortcodes/*.html]
insert_final_newline = false
--------------------------------------------------------------------------------
/netlify.toml:
--------------------------------------------------------------------------------
[build]
command = "hugo --gc --minify -b $URL"
publish = "public"

[build.environment]
HUGO_VERSION = "0.97.3"
HUGO_ENABLEGITINFO = "true"

[context.production.environment]
HUGO_ENV = "production"

[context.deploy-preview]
command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL"

[context.branch-deploy]
command = "hugo --gc --minify -b $DEPLOY_PRIME_URL"

[[plugins]]
package = "netlify-plugin-hugo-cache-resources"
[plugins.inputs]
debug = true
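
# Note on the build commands above: Hugo's `-b`/`--baseURL` flag overrides the
# site's configured baseURL, so deploy previews and branch deploys build against
# the Netlify-provided $DEPLOY_PRIME_URL rather than the production $URL;
# `--buildFuture` additionally renders future-dated content in deploy previews.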
--------------------------------------------------------------------------------
/config/_default/params.yaml:
--------------------------------------------------------------------------------
# SITE SETUP
# Guide: https://wowchemy.com/docs/getting-started/
# Documentation: https://wowchemy.com/docs/
# This file is formatted using YAML syntax - learn more at https://learnxinyminutes.com/docs/yaml/

# Appearance

appearance:
  theme_day: minimal
  theme_night: minimal
  font: minimal
  font_size: L

# SEO

marketing:
  seo:
    site_type: Project
    local_business_type: ''
    org_name: ''
    description: 'The latest research, news, and papers on large language model security.'
    twitter: 'llm_sec'
  analytics:
    google_analytics: ''
    baidu_tongji: ''
    google_tag_manager: ''
    microsoft_clarity: ''
  verification:
    google: ''
    baidu: ''

# Site header

header:
  navbar:
    enable: false
    align: l
    show_logo: true
    show_language: false
    show_day_night: true
    show_search: false
    highlight_active_link: true

footer:
  copyright:
    notice: '© {year} [@llm_sec](https://twitter.com/llm_sec). This work is licensed under {license}'
    license:
      enable: true
      allow_derivatives: true
      share_alike: false
      allow_commercial: true

# Localization

locale:
  date_format: 'Jan 2, 2006'
  time_format: '3:04 PM'

# Site features

features:
  syntax_highlighter:
    theme_light: github-light
    theme_dark: dracula
  math:
    enable: false
  privacy_pack:
    enable: false
  repository:
    url: 'https://github.com/llmsecnet/llmsec-site'
    content_dir: content
    branch: main
  comment:
    provider: ''
    disqus:
      shortname: ''
      show_count: true
    commento:
      url: ''
    giscus:
      repo: ''
      repo_id: ''
      category: ''
      category_id: ''
--------------------------------------------------------------------------------
/config/_default/config.yaml:
--------------------------------------------------------------------------------
# Configuration of Hugo
# Guide: https://wowchemy.com/docs/getting-started/
# Hugo Documentation: https://gohugo.io/getting-started/configuration/#all-configuration-settings
# This file is formatted using YAML syntax - learn more at https://learnxinyminutes.com/docs/yaml/

title: 'LLM Security' # Website name
baseURL: 'https://llmsec.net/' # Website URL

############################
## PAGE OPTIONS
############################

cascade:
  # Docs folder options
  - _target:
      path: /**
    type: book
    editable: true
    show_breadcrumb: true

############################
## LANGUAGE
############################

languageCode: en-us
hasCJKLanguage: false
defaultContentLanguageInSubdir: false
removePathAccents: true

############################
## MODULES
############################

module:
  imports:
    - path: github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify-cms
      disable: true
    - path: github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy-plugin-netlify
    - path: github.com/wowchemy/wowchemy-hugo-themes/modules/wowchemy/v5

############################
## ADVANCED
############################

enableGitInfo: false
summaryLength: 30
paginate: 10
enableEmoji: true
enableRobotsTXT: true
footnotereturnlinkcontents: ^
ignoreFiles: [\.ipynb$, .ipynb_checkpoints$, \.Rmd$, \.Rmarkdown$, _cache$]
permalinks:
  authors: '/author/:slug/'
  tags: '/tag/:slug/'
  categories: '/category/:slug/'
disableAliases: true
outputs:
  home: [HTML, RSS, JSON, WebAppManifest, headers, redirects]
  section: [HTML, RSS]
imaging:
  resampleFilter: lanczos
  quality: 97
  anchor: smart
timeout: 600000
taxonomies:
  tag: tags
  category: categories
  author: authors
markup:
  _merge: deep
related:
  threshold: 80
  includeNewer: true
  toLower: true
  indices:
    - name: tags
      weight: 100
    - name: categories
      weight: 70
security:
  _merge: deep
sitemap:
  _merge: deep
minify:
  _merge: deep
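
# Note: keys marked `_merge: deep` above (markup, security, sitemap, minify)
# are deep-merged with the corresponding defaults supplied by the imported
# theme modules, rather than one side replacing the other outright.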
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# [Hugo Second Brain Theme](https://github.com/wowchemy/hugo-second-brain-theme)

[![Screenshot](./preview.webp)](https://wowchemy.com/hugo-themes/)

The **Second Brain** starter template empowers you to easily create **personal notes** and **knowledge bases** in a future-proof way.

- It is your **second brain** 🧠, stored in **future-proof** Markdown files
- Supports audio, video, images, math, code, [Mermaid](https://mermaid.live/) diagrams, and [much more](https://wowchemy.com/docs/content/writing-markdown-latex/)
- Edit your notes online on GitHub, or in any Git-connected Markdown app such as [Obsidian](https://obsidian.md/) or [Visual Studio Code](https://vscode.dev/)

[![Get Started](https://img.shields.io/badge/-Get%20started-ff4655?style=for-the-badge)](https://wowchemy.com/hugo-themes/)
[![Discord](https://img.shields.io/discord/722225264733716590?style=for-the-badge)](https://discord.com/channels/722225264733716590/742892432458252370/742895548159492138)
[![Twitter Follow](https://img.shields.io/twitter/follow/wowchemy?label=Follow%20on%20Twitter)](https://twitter.com/wowchemy)

[Check out the latest demo](https://wowchemy.com/docs/) of what you'll get in less than 10 minutes, or [get inspired by how others are using this template](https://wowchemy.com/creators/).

The integrated [**Wowchemy**](https://wowchemy.com) website builder and CMS makes it easy to create a beautiful website for free. Edit your site in the CMS (or your favorite editor), generate it with [Hugo](https://github.com/gohugoio/hugo), and deploy with GitHub or Netlify. Customize anything on your site with widgets, light/dark themes, and language packs.

- 👉 [**Get Started**](https://wowchemy.com/hugo-themes/)
- 📚 [View the **documentation**](https://wowchemy.com/docs/)
- 💬 [Chat with the **Wowchemy research community**](https://discord.gg/z8wNYzb) or the [**Hugo community**](https://discourse.gohugo.io)
- ⬇️ **Automatically import citations from BibTeX** with the [Hugo Academic CLI](https://github.com/wowchemy/hugo-academic-cli)
- 🐦 Share your new site with the community: [@wowchemy](https://twitter.com/wowchemy) [@GeorgeCushen](https://twitter.com/GeorgeCushen) [#MadeWithWowchemy](https://twitter.com/search?q=%23MadeWithWowchemy&src=typed_query)
- 🗳 [Take the survey and help us improve #OpenSource](https://forms.gle/NioD9VhUg7PNmdCAA)
- 🚀 [Contribute improvements](https://github.com/wowchemy/wowchemy-hugo-themes/blob/main/CONTRIBUTING.md) or [suggest ideas](https://github.com/wowchemy/wowchemy-hugo-themes/issues)
- ⬆️ **Updating?** View the [Update Guide](https://wowchemy.com/docs/hugo-tutorials/update/) and [Release Notes](https://github.com/wowchemy/wowchemy-hugo-themes/releases)

## We ask you, humbly, to support this open source movement

Today we ask you to defend the open source independence of the Wowchemy website builder and themes 🐧

We're an open source movement that depends on your support to stay online and thriving, but 99.9% of our creators don't give; they simply look the other way.

### [❤️ Click here to become a GitHub Sponsor, unlocking awesome perks such as _exclusive academic templates and widgets_](https://github.com/sponsors/gcushen)

--------------------------------------------------------------------------------
/content/_index.md:
--------------------------------------------------------------------------------
---
title: LLM Security
toc: false

image:
  filename: covers/pexels-nuno-fangueiro-12125258.jpg
  caption: Monstera - Nuno Fangueiro
---

LLM security is the investigation of the failure modes of LLMs in use, the conditions that lead to them, and their mitigations.

Here are links to large language model security content - research, papers, and news - posted by [@llm_sec](https://twitter.com/llm_sec).

Got a tip/link? Open a [pull request](https://github.com/llmsecnet/llmsec-site) or send a [DM](https://twitter.com/llm_sec).

## Getting Started

* [How to hack Google Bard, ChatGPT, or any other chatbot](https://dataconomy.com/2023/09/01/how-to-hack-google-bard-chatbots/)
* [Prompt injection primer for engineers](https://github.com/jthack/PIPE)
* [Tutorial based on the OWASP LLM Top 10 vulnerabilities, by Hego](https://wiki.hego.tech/owasp/owasp-llm-top-10-v1.0)

## Attacks

### Adversarial

* [A LLM Assisted Exploitation of AI-Guardian](https://arxiv.org/abs/2307.15008)
* [Adversarial Attacks on Tables with Entity Swap](https://ceur-ws.org/Vol-3462/TADA4.pdf)
* [Adversarial Demonstration Attacks on Large Language Models](https://arxiv.org/abs/2305.14950)
* [Adversarial Examples Are Not Bugs, They Are Features](https://arxiv.org/abs/1905.02175) 🌶️
* [Are Aligned Language Models “Adversarially Aligned”?](https://www.youtube.com/watch?v=uqOfC3KSZFc) 🌶️
* [Bad Characters: Imperceptible NLP Attacks](https://arxiv.org/abs/2106.09898)
* [Breaking BERT: Understanding its Vulnerabilities for Named Entity Recognition through Adversarial Attack](https://arxiv.org/abs/2109.11308)
* [Expanding Scope: Adapting English Adversarial Attacks to Chinese](https://aclanthology.org/2023.trustnlp-1.24/)
* [Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To!](https://arxiv.org/abs/2310.03693)
* [Gradient-based Adversarial Attacks against Text Transformers](https://arxiv.org/abs/2104.13733)
* [Gradient-Based Word Substitution for Obstinate Adversarial Examples Generation in Language Models](https://arxiv.org/abs/2307.12507)
* [Sample Attackability in Natural Language Adversarial Attacks](https://aclanthology.org/2023.trustnlp-1.9/)
* [Universal and Transferable Adversarial Attacks on Aligned Language Models](https://arxiv.org/abs/2307.15043)
* [Why Should Adversarial Perturbations be Imperceptible? Rethink the Research Paradigm in Adversarial NLP](https://arxiv.org/abs/2210.10683) 🌶️

### Backdoors & data poisoning

* [A backdoor attack against LSTM-based text classification systems](https://arxiv.org/abs/1905.12457) "Submitted on 29 May 2019"!
* [A Gradient Control Method for Backdoor Attacks on Parameter-Efficient Tuning](https://aclanthology.org/2023.acl-long.194/)
* [Are You Copying My Model? Protecting the Copyright of Large Language Models for EaaS via Backdoor Watermark](https://arxiv.org/abs/2305.10036)
* [Backdoor Learning on Sequence to Sequence Models](https://arxiv.org/abs/2305.02424)
* [Backdooring Neural Code Search](https://arxiv.org/abs/2305.17506) 🌶️
* [BadPre: Task-agnostic Backdoor Attacks to Pre-trained NLP Foundation Models](https://arxiv.org/abs/2110.02467)
* [BadPrompt: Backdoor Attacks on Continuous Prompts](https://arxiv.org/abs/2211.14719)
* [Be Careful about Poisoned Word Embeddings: Exploring the Vulnerability of the Embedding Layers in NLP Models](https://arxiv.org/abs/2103.15543)
* [BadNL: Backdoor Attacks against NLP Models with Semantic-preserving Improvements](https://arxiv.org/abs/2006.01043)
* [BITE: Textual Backdoor Attacks with Iterative Trigger Injection](https://arxiv.org/abs/2205.12700) 🌶️
* [Exploring the Universal Vulnerability of Prompt-based Learning Paradigm](https://aclanthology.org/2022.findings-naacl.137/)
* [Hidden Killer: Invisible Textual Backdoor Attacks with Syntactic Trigger](https://arxiv.org/abs/2105.12400) 🌶️
* [Instructions as Backdoors: Backdoor Vulnerabilities of Instruction Tuning for Large Language Models](https://arxiv.org/abs/2305.14710)
* [Mind the Style of Text! Adversarial and Backdoor Attacks Based on Text Style Transfer](https://aclanthology.org/2021.emnlp-main.374/)
* [On the Exploitability of Instruction Tuning](https://arxiv.org/abs/2306.17194)
* [Poisoning Web-Scale Training Datasets is Practical](https://arxiv.org/abs/2302.10149) 🌶️
* [Prompt as Triggers for Backdoor Attack: Examining the Vulnerability in Language Models](https://arxiv.org/abs/2305.01219)
* [Textual Backdoor Attacks Can Be More Harmful via Two Simple Tricks](https://arxiv.org/abs/2110.08247)
* [Two-in-One: A Model Hijacking Attack Against Text Generation Models](https://arxiv.org/abs/2305.07406)

### Prompt injection

* [Bing Chat: Data Exfiltration Exploit Explained](https://embracethered.com/blog/posts/2023/bing-chat-data-exfiltration-poc-and-fix/) 🌶️
* [ChatGPT's new browser feature is affected by Indirect Prompt Injection vulnerability](https://twitter.com/evrnyalcin/status/1707298475216425400)
* [Compromising LLMs: The Advent of AI Malware](https://www.blackhat.com/us-23/briefings/schedule/index.html#compromising-llms-the-advent-of-ai-malware-33075)
* [Generative AI’s Biggest Security Flaw Is Not Easy to Fix](https://www.wired.com/story/generative-ai-prompt-injection-hacking/)
* [GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher](https://arxiv.org/abs/2308.06463)
* [Hackers Compromised ChatGPT Model with Indirect Prompt Injection](https://gbhackers.com/hackers-compromised-chatgpt-model/)
* [Large Language Model Prompts for Prompt Injection (RTC0006)](https://redteamrecipe.com/Large-Language-Model-Prompts/)
* [Ignore Previous Prompt: Attack Techniques For Language Models](https://arxiv.org/abs/2211.09527) 🌶️
* [Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection](https://arxiv.org/abs/2302.12173) 🌶️
* [Prompt Injection attack against LLM-integrated Applications](https://arxiv.org/abs/2306.05499)
* [Safeguarding Crowdsourcing Surveys from ChatGPT with Prompt Injection](https://arxiv.org/abs/2306.08833)
* [Virtual Prompt Injection for Instruction-Tuned Large Language Models](https://arxiv.org/abs/2307.16888)

### Jailbreaking

* [AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models](https://arxiv.org/abs/2310.04451) 🌶️
* ["Do Anything Now": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models](https://arxiv.org/abs/2308.03825) 🌶️
* [GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts](https://arxiv.org/abs/2309.10253)
* [JAILBREAKER: Automated Jailbreak Across Multiple Large Language Model Chatbots](https://arxiv.org/pdf/2307.08715.pdf)
* [Jailbroken: How Does LLM Safety Training Fail?](https://arxiv.org/abs/2307.02483)
* [LLM Censorship: A Machine Learning Challenge Or A Computer Security Problem?](https://www.cl.cam.ac.uk/~is410/Papers/llm_censorship.pdf) (mosaic prompts)
* [Low-Resource Languages Jailbreak GPT-4](https://arxiv.org/abs/2310.02446) 🌶️
* [Self-Deception: Reverse Penetrating the Semantic Firewall of Large Language Models](https://arxiv.org/abs/2308.11521v1)

### Data extraction & privacy

* [DP-Forward: Fine-tuning and Inference on Language Models with Differential Privacy in Forward Pass](https://arxiv.org/abs/2309.06746)
* [Extracting Training Data from Large Language Models](https://arxiv.org/abs/2012.07805)
* [Privacy Side Channels in Machine Learning Systems](https://arxiv.org/abs/2309.05610) 🌶️
* [Prompts Should not be Seen as Secrets: Systematically Measuring Prompt Extraction Attack Success](https://arxiv.org/abs/2307.06865)
* [ProPILE: Probing Privacy Leakage in Large Language Models](https://arxiv.org/abs/2307.01881) 🌶️
* [Training Data Extraction From Pre-trained Language Models: A Survey](https://aclanthology.org/2023.trustnlp-1.23/)

### Data reconstruction

* [Deconstructing Classifiers: Towards A Data Reconstruction Attack Against Text Classification Models](https://arxiv.org/abs/2306.13789)

### Denial of service

* [Sponge Examples: Energy-Latency Attacks on Neural Networks](https://arxiv.org/abs/2006.03463) 🌶️

### Escalation

* [Demystifying RCE Vulnerabilities in LLM-Integrated Apps](https://arxiv.org/abs/2309.02926) 🌶️
* [Hacking Auto-GPT and escaping its docker container](https://positive.security/blog/auto-gpt-rce)

### Evasion

* [Large Language Models can be Guided to Evade AI-Generated Text Detection](https://arxiv.org/abs/2305.10847)
* [GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher](https://arxiv.org/abs/2308.06463)

### Malicious code

* [A Study on Robustness and Reliability of Large Language Model Code Generation](https://arxiv.org/abs/2308.10335)
* [Can you trust ChatGPT’s package recommendations?](https://vulcan.io/blog/ai-hallucinations-package-risk)

### XSS/CSRF/CPRF

* [LLM causing self-XSS](https://hackstery.com/2023/07/10/llm-causing-self-xss/)

### Cross-model

* [Exploring the Vulnerability of Natural Language Processing Models via Universal Adversarial Texts](https://aclanthology.org/2021.alta-1.14/)

### Multimodal

* [(Ab)using Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs](https://arxiv.org/abs/2307.10490)
* [Image to Prompt Injection with Google Bard](https://embracethered.com/blog/posts/2023/google-bard-image-to-prompt-injection/)
* [Plug and Pray: Exploiting off-the-shelf components of Multi-Modal Models](https://arxiv.org/abs/2307.14539)
* [Visual Adversarial Examples Jailbreak Aligned Large Language Models](https://arxiv.org/abs/2306.13213)

### Model theft

* [Stealing Machine Learning Models via Prediction APIs](https://arxiv.org/abs/1609.02943)

### Attack automation

* [FakeToxicityPrompts: Automatic Red Teaming](https://interhumanagreement.substack.com/p/faketoxicityprompts-automatic-red)
* [FLIRT: Feedback Loop In-context Red Teaming](https://huggingface.co/papers/2308.04265)
* [Red Teaming Language Models with Language Models](https://arxiv.org/abs/2202.03286)
* [Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned](https://arxiv.org/abs/2209.07858)
* [Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment](https://arxiv.org/abs/2308.09662)

## Defenses & Detections

### against things other than backdoors

* [Baseline Defenses for Adversarial Attacks Against Aligned Language Models](https://arxiv.org/abs/2309.00614)
* [Defending ChatGPT against Jailbreak Attack via Self-Reminder](https://assets.researchsquare.com/files/rs-2873090/v1_covered_3dc9af48-92ba-491e-924d-b13ba9b7216f.pdf?c=1686882819)
* [Diffusion Theory as a Scalpel: Detecting and Purifying Poisonous Dimensions in Pre-trained Language Models Caused by Backdoor or Bias](https://arxiv.org/abs/2305.04547)
* [Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://proceedings.neurips.cc/paper_files/paper/2022/hash/e8c20cafe841cba3e31a17488dc9c3f1-Abstract-Conference.html)
* [FedMLSecurity: A Benchmark for Attacks and Defenses in Federated Learning and LLMs](https://arxiv.org/abs/2306.04959)
* [Interpretability and Transparency-Driven Detection and Transformation of Textual Adversarial Examples (IT-DT)](https://arxiv.org/abs/2307.01225)
* [Large Language Models for Code: Security Hardening and Adversarial Testing](https://www.sri.inf.ethz.ch/publications/ccs23-llmsec)
* [LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked](https://arxiv.org/abs/2308.07308)
* [Make Text Unlearnable: Exploiting Effective Patterns to Protect Personal Data](https://aclanthology.org/2023.trustnlp-1.22/)
* [Mitigating Stored Prompt Injection Attacks Against LLM Applications](https://developer.nvidia.com/blog/mitigating-stored-prompt-injection-attacks-against-llm-applications/)
* [RAIN: Your Language Models Can Align Themselves without Finetuning](https://arxiv.org/abs/2309.07124) 🌶️
* [Secure your machine learning with Semgrep](https://blog.trailofbits.com/2022/10/03/semgrep-maching-learning-static-analysis/)
* [Sparse Logits Suffice to Fail Knowledge Distillation](https://openreview.net/forum?id=BxZgduuNDl5)
* [Text-CRS: A Generalized Certified Robustness Framework against Textual Adversarial Attacks](https://arxiv.org/abs/2307.16630)
* [Thinking about the security of AI systems](https://www.ncsc.gov.uk/blog-post/thinking-about-security-ai-systems)
* [Towards building a robust toxicity predictor](https://www.amazon.science/publications/towards-building-a-robust-toxicity-predictor)

### against backdoors / backdoor insertion

* [Defending against Insertion-based Textual Backdoor Attacks via Attribution](https://aclanthology.org/2023.findings-acl.561/)
* [Donkii: Can Annotation Error Detection Methods Find Errors in Instruction-Tuning Datasets?](https://arxiv.org/abs/2309.01669)
* [Exploring the Universal Vulnerability of Prompt-based Learning Paradigm](https://aclanthology.org/2022.findings-naacl.137/)
* [GPTs Don’t Keep Secrets: Searching for Backdoor Watermark Triggers in Autoregressive Language Models](https://aclanthology.org/2023.trustnlp-1.21/) 🌶️
* [IMBERT: Making BERT Immune to Insertion-based Backdoor Attacks](https://aclanthology.org/2023.trustnlp-1.25/) 🌶️
* [Maximum Entropy Loss, the Silver Bullet Targeting Backdoor Attacks in Pre-trained Language Models](https://aclanthology.org/2023.findings-acl.237/)
* [ONION: A Simple and Effective Defense Against Textual Backdoor Attacks](https://arxiv.org/abs/2011.10369)
* [ParaFuzz: An Interpretability-Driven Technique for Detecting Poisoned Samples in NLP](https://arxiv.org/abs/2308.02122) 🌶️
* [VDC: Versatile Data Cleanser for Detecting Dirty Samples via Visual-Linguistic Inconsistency](https://arxiv.org/abs/2309.16211)

## Evaluation

* [Do you really follow me? Adversarial Instructions for Evaluating the Robustness of Large Language Models](https://arxiv.org/abs/2308.10819)
* [Evaluating the Susceptibility of Pre-Trained Language Models via Handcrafted Adversarial Examples](https://arxiv.org/abs/2209.02128)
* [Latent Jailbreak: A Test Suite for Evaluating Both Text Safety and Output Robustness of Large Language Models](https://arxiv.org/abs/2307.08487) 🌶️
* [LLM-Deliberation: Evaluating LLMs with Interactive Multi-Agent Negotiation Games](https://arxiv.org/abs/2309.17234)
* [LLM Platform Security: Applying a Systematic Evaluation Framework to OpenAI's ChatGPT Plugins](https://arxiv.org/abs/2309.10254)
* [PromptBench: Towards Evaluating the Robustness of Large Language Models on Adversarial Prompts](https://arxiv.org/abs/2306.04528)
* [TrustGPT: A Benchmark for Trustworthy and Responsible Large Language Models](https://arxiv.org/abs/2306.11507)

## Practices

* [A framework to securely use LLMs in companies - Part 1: Overview of Risks](https://boringappsec.substack.com/p/edition-21-a-framework-to-securely)
* [All the Hard Stuff Nobody Talks About when Building Products with LLMs](https://www.honeycomb.io/blog/hard-stuff-nobody-talks-about-llm)
* [Artificial intelligence and machine learning security](https://learn.microsoft.com/en-us/security/engineering/failure-modes-in-machine-learning) (Microsoft) 🌶️
* [Assessing Language Model Deployment with Risk Cards](https://arxiv.org/abs/2303.18190)
* [Explore, Establish, Exploit: Red Teaming Language Models from Scratch](https://arxiv.org/abs/2306.09442)
* [Protect Your Prompts: Protocols for IP Protection in LLM Applications](https://arxiv.org/abs/2306.06297)
* ["Real Attackers Don't Compute Gradients": Bridging the Gap Between Adversarial ML Research and Practice](https://arxiv.org/abs/2212.14315) 🌶️
* [Red Teaming Handbook](https://assets.publishing.service.gov.uk/media/61702155e90e07197867eb93/20210625-Red_Teaming_Handbook.pdf) 🌶️
* [Securing LLM Systems Against Prompt Injection](https://developer.nvidia.com/blog/securing-llm-systems-against-prompt-injection/)
* [Threat Modeling LLM Applications](https://aivillage.org/large%20language%20models/threat-modeling-llm/)
* [Toward Comprehensive Risk Assessments and Assurance of AI-Based Systems](https://docs.google.com/viewer?url=https://raw.githubusercontent.com/trailofbits/publications/master/papers/toward_comprehensive_risk_assessments.pdf)
* [Understanding the risks of deploying LLMs in your enterprise](https://www.moveworks.com/insights/risks-of-deploying-llms-in-your-enterprise)

## Analyses & surveys

* [A Comprehensive Overview of Backdoor Attacks in Large Language Models within Communication Networks](https://arxiv.org/abs/2308.14367)
* [Chatbots to ChatGPT in a Cybersecurity Space: Evolution, Vulnerabilities, Attacks, Challenges, and Future Recommendations](https://arxiv.org/abs/2306.09255)
* [Identifying and Mitigating the Security Risks of Generative AI](https://arxiv.org/abs/2308.14840)
* [OWASP Top 10 for LLM vulnerabilities](https://llmtop10.com/) 🌶️
* [Security and Privacy on Generative Data in AIGC: A Survey](https://arxiv.org/abs/2309.09435)
* [The AI Attack Surface Map v1.0](https://danielmiessler.com/p/the-ai-attack-surface-map-v1-0/)
* [Towards Security Threats of Deep Learning Systems: A Survey](https://arxiv.org/abs/1911.12562)

## Policy, legal, ethical, and social

* [Are You Worthy of My Trust?: A Socioethical Perspective on the Impacts of Trustworthy AI Systems on the Environment and Human Society](https://arxiv.org/abs/2309.09450)
* [Cybercrime and Privacy Threats of Large Language Models](https://ieeexplore.ieee.org/abstract/document/10174273)
* [Ethical Considerations and Policy Implications for Large Language Models: Guiding Responsible Development and Deployment](https://arxiv.org/abs/2308.02678)
* [Frontier AI Regulation: Managing Emerging Risks to Public Safety](https://arxiv.org/abs/2307.03718)
* [Loose-lipped large language models spill your secrets: The privacy implications of large language models](https://jolt.law.harvard.edu/assets/articlePDFs/v36/Winograd-Loose-Lipped-LLMs.pdf)
* [On the Trustworthiness Landscape of State-of-the-art Generative Models: A Comprehensive Survey](https://arxiv.org/abs/2307.16680)
* [On the Dangers of Stochastic Parrots: Can Language Models Be Too Big? 🦜](https://dl.acm.org/doi/10.1145/3442188.3445922) 🌶️
* [Product Liability for Defective AI](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4515202)
* [The last attempted AI revolution in security, and the next one](https://drive.google.com/file/d/1BbSIBayQ1RHVSnh-FnaeXr8xjw5SVJV8/view?pli=1)
* [Unveiling Security, Privacy, and Ethical Concerns of ChatGPT](https://arxiv.org/abs/2307.14192)
* [Where's the Liability in Harmful AI Speech?](https://arxiv.org/abs/2308.04635)

## Software

### LLM-specific

* [BITE](https://github.com/INK-USC/BITE) Textual Backdoor Attacks with Iterative Trigger Injection
* [garak](https://github.com/leondz/garak/) LLM vulnerability scanner 🌶️🌶️
* [HouYi](https://github.com/LLMSecurity/HouYi) successful prompt injection framework 🌶️
* [dropbox/llm-security](https://github.com/dropbox/llm-security) demo scripts & docs for LLM attacks
* [promptmap](https://github.com/utkusen/promptmap) bulk testing of prompt injection on OpenAI LLMs
* [rebuff](https://github.com/protectai/rebuff) LLM Prompt Injection Detector
* [vigil-llm](https://github.com/deadbits/vigil-llm) risky LLM input detection

### general MLsec

* [Adversarial Robustness Toolbox](https://github.com/Trusted-AI/adversarial-robustness-toolbox)
* [nvtrust](https://github.com/NVIDIA/nvtrust) Ancillary open source software to support confidential computing on NVIDIA GPUs

🌶️ = extra spicy
--------------------------------------------------------------------------------