├── .gitignore ├── Cargo.toml ├── LICENSE.md ├── README.md ├── TERMS_OF_USE.md ├── assets ├── .DS_Store └── ui │ ├── .DS_Store │ └── images │ ├── delete.png │ ├── delete.svg │ ├── list_add.png │ └── list_add.svg ├── bin ├── dittomancer │ ├── .gitignore │ ├── README.md │ ├── dittomancer.rs │ ├── fred_rogers.toml │ └── static │ │ ├── index.html │ │ ├── reset.css │ │ ├── script.js │ │ └── style.css ├── regurgitater │ ├── README.md │ ├── regurgitater.rs │ └── static │ │ ├── index.html │ │ ├── regurgitater.png │ │ ├── reset.css │ │ ├── script.js │ │ └── style.css └── settings_tool │ ├── README.md │ └── settings_tool.rs ├── logo.svg ├── logo_inkscape.svg ├── models └── README.md ├── rustfmt.toml ├── src ├── batch.rs ├── candidates.rs ├── cli.rs ├── data.rs ├── data │ ├── banned.rs │ └── stopwords.rs ├── engine.rs ├── lib.rs ├── model.rs ├── model │ └── vocab.rs ├── ngram.rs ├── predictor.rs ├── probability.rs ├── prompt.rs ├── prompt │ └── format.rs ├── sample.rs ├── utils.rs └── utils │ └── test.rs └── tests └── data ├── README.md ├── banned_ngrams └── ngrams-english-llama.txt └── detect-infringement ├── lyrics ├── 5_on_it.txt ├── README.md ├── a_day_in_the_life.txt ├── a_whole_new_world.txt ├── aenema.txt ├── bad_romance.txt ├── barbie_girl.txt ├── bohemian_rhapsody.txt ├── born_this_way.txt ├── buckley-hallelujah.txt ├── can_you_feel_the_love_tonight.txt ├── candle_in_the_wind.txt ├── closer.txt ├── cohen-hallelujah.txt ├── eleanor.txt ├── father_lucifer.txt ├── fire_water_burn.txt ├── gangstas_paradise.txt ├── graceland.txt ├── hakuna_matata.txt ├── hotel_california.txt ├── imagine.txt ├── je_ne_regrette_rien.txt ├── knockin_on_heavens_door.txt ├── landslide.txt ├── last_cristmas.txt ├── life_on_mars.txt ├── like_a_prayer.txt ├── like_a_virgin.txt ├── loser.txt ├── lovin_feeling.txt ├── my_name_is.txt ├── my_way.txt ├── nothing_compares.txt ├── one_more_time.txt ├── rhiannon.txt ├── running_up_that_hill.txt ├── sober.txt ├── sound_of_silence.txt ├── 
sympathy_for_the_devil.txt ├── teen_spirit.txt ├── total_eclipse.txt ├── watchtower.txt ├── wild_side.txt ├── wonderwall.txt └── yesterday.txt ├── nyt ├── README.md ├── guys.txt └── snow_fall.txt ├── random └── navyseal.txt ├── scientology ├── README.md └── ot3.txt └── tolkien ├── hobbit-chapter-1.txt └── hobbit-chapter-2.txt /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | models/*.gguf 3 | /Cargo.lock 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "drama_llama" 3 | version = "0.5.2" 4 | edition = "2021" 5 | description = "A library for language modeling and text generation." 6 | license-file = "LICENSE.md" 7 | repository = "https://github.com/mdegans/drama_llama" 8 | 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | llama-cpp-sys-3 = "0.5" 13 | 14 | derive_more = "0.99.17" 15 | num = "0.4" 16 | partial_sort = { version = "0.2.0" } 17 | rand = { version = "0.8" } 18 | regex = "1.10" 19 | static_assertions = "1.1.0" 20 | thiserror = "1.0" 21 | tinyvec = "1.6" 22 | xorshift = "0.1" 23 | rayon = "1.10.0" 24 | 25 | markdown = { version = "=1.0.0-alpha.16", optional = true } 26 | rocket = { version = "0.5", optional = true, features = ["json"] } 27 | clap = { version = "4.5", optional = true, features = ["derive"] } 28 | stringmetrics = { version = "2.2.2", optional = true } 29 | toml = { version = "0.8", optional = true } 30 | serde_json = { version = "1.0", optional = true } 31 | dirs = { version = "5.0.1", optional = true } 32 | egui = { version = "0.27", optional = true } 33 | eframe = { version = "0.27", optional = true } 34 | egui_file = { version = "0.17.0", optional = true } 35 | egui_extras = { version = "0.27", optional = true, features = ["all_loaders"] } 36 | 
image = { version = "0.25", optional = true, features = ["png"] } 37 | 38 | 39 | [features] 40 | webchat = ["dep:rocket", "toml", "dep:dirs", "dep:markdown", "serde"] 41 | toml = ["dep:toml"] 42 | cli = ["dep:clap"] 43 | # we use rocket's serde support 44 | serde = ["dep:rocket", "tinyvec/serde"] 45 | stats = ["dep:stringmetrics"] 46 | cuda = ["llama-cpp-sys-3/cuda"] 47 | cuda_f16 = ["llama-cpp-sys-3/cuda_f16"] 48 | egui = [ 49 | "dep:egui", 50 | "dep:eframe", 51 | "dep:egui_file", 52 | "dep:egui_extras", 53 | "dep:image", 54 | ] 55 | 56 | [[bin]] 57 | name = "dittomancer" 58 | path = "bin/dittomancer/dittomancer.rs" 59 | required-features = ["webchat", "cli"] 60 | 61 | [[bin]] 62 | name = "regurgitater" 63 | path = "bin/regurgitater/regurgitater.rs" 64 | required-features = ["webchat", "cli", "stats"] 65 | 66 | [[bin]] 67 | name = "settings_tool" 68 | path = "bin/settings_tool/settings_tool.rs" 69 | required-features = ["egui", "serde", "serde_json"] 70 | 71 | [package.metadata.docs.rs] 72 | # `cuda` will break the build on platforms without it, and it doesn't change the 73 | # docs anyway. 74 | features = ["webchat", "cli", "stats", "toml", "serde", "egui"] 75 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # RESPONSIBLE AI SOURCE CODE LICENSE 2 | 3 | http://licenses.ai/ 4 | 5 | ## TERMS AND CONDITIONS. 6 | 7 | The Responsible Artificial Intelligence Source Code License (“License”) governs the use of the accompanying software. If you access or use the software, you accept the License. If you do not accept the License, do not access or use the software. 8 | 9 | ## 1. Definitions. 10 | 11 | As used in this License, the following capitalized terms have the following meanings: 12 | 13 | (i) "License" means the terms and conditions for use, reproduction, and distribution as defined by Sections one (1) through eight (8) of this document. 
14 | 15 | (ii) "Licensor" means the copyright owner or legal entity authorized by the copyright owner that is granting the License. 16 | 17 | (iii) "You" (or "Your") means an individual or legal entity exercising permissions granted by this License. 18 | 19 | (iv) The terms “reproduce”, “reproduction”, “derivative works”, and “distribution” have the same meaning here as under U.S. Copyright Law. 20 | 21 | (v) “Contribution” means the original software, additions to the original software, modifications to the original software, or derivative works of the original software. 22 | 23 | (vi) "Contributor" means any person or Licensor who provides a Contribution. 24 | 25 | ## 2. Grant of Rights. 26 | 27 | Subject to this License, each Contributor grants You a non-exclusive, worldwide, royalty-free copyright license to reproduce its Contribution, prepare derivative works of its Contribution, and distribute its Contribution or any derivative works of its Contribution that You create. 28 | 29 | ## 3. Restrictions 30 | 31 | 1. If You distribute any portion of the Contribution, You must include a complete copy of this License with the distribution; and 32 | 33 | 2. You agree that the Contribution, or any derivative work of the Contribution, will not be used by You or any third party subject under your control for any prohibited use in [`TERMS_OF_USE.md`](TERMS_OF_USE.md) 34 | 35 | 3. Restrictions referenced in Section 3.2 **MUST** be included as an enforceable provision by You in any type of legal agreement governing the use and/or distribution of the Work or any Derivative Works, and You shall give notice to subsequent users You Distribute to, that the Work or any Derivative Works are subject to Section 3.2. **You shall require all of Your users who use the Work or any Derivative Works to comply with the terms of use in [`TERMS_OF_USE.md`](TERMS_OF_USE.md).** 36 | 37 | ## 4. Termination 38 | 39 | Upon the occurrence of any of the restricted uses listed above in “3. 
Restrictions”, Licensor shall have the right to: 40 | 41 | (i) terminate this License Agreement and disable any Contribution either by pre-installed or then installed disabling instructions, and to take immediate possession of the Contribution and all copies wherever located, without demand or notice; 42 | 43 | (ii) require You to immediately return to Licensor all copies of the Contribution, or upon request by Licensor destroy the Contribution and all copies and certify in writing that they have been destroyed; 44 | 45 | (iii) for a period of 10 years, provide a prominent notice on the Licensor’s website indicating that this License was violated by the Licensor; 46 | 47 | (iv) release/delete any and all data collected through use of the Contribution; and 48 | 49 | (v) notify all parties affected by use of the Contribution. 50 | 51 | Termination of this License Agreement shall be in addition to and not in lieu of any other remedies available to Licensor. Licensor expressly reserves the right to pursue all legal and equitable remedies available under the law. 52 | 53 | ## 5. Disclaimer of Warranty. 54 | 55 | Unless required by applicable law or agreed to in writing, Licensor provides any Contribution (and each Contributor provides its Contributions) on an "As-Is" basis, without WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing a Contribution and assume any risks associated with Your exercise of permissions under this License. 56 | 57 | ## 6. Limitation of Liability. 
58 | 59 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use any Contribution (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 60 | 61 | ## 7. Accepting Warranty or Additional Liability. 62 | 63 | While redistributing the Contribution, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `drama_llama` 2 | 3 | ![llama with drama mask logo](logo.svg) 4 | 5 | `drama_llama` is yet another Rust wrapper for [`llama.cpp`]. It is a work in progress and not intended for production use. The API _will_ change. 6 | 7 | For examples, see the `bin` folder. There are two example binaries. 8 | 9 | - **[Dittomancer](bin/dittomancer/README.md)** - Chat with well represented personalities in the training. 
10 | - **[Regurgitater](bin/regurgitater/README.md)** - Test local language models for memorized content. 11 | 12 | ## Supported Features 13 | 14 | - LLaMA 3 Support. 15 | - Iterators yielding candidates, tokens and pieces. 16 | - Stop criteria at regex, token sequence, and/or string sequence. 17 | - Metal support. CUDA may be enabled with the `cuda` and `cuda_f16` features. 18 | - Rust-native sampling code. All sampling methods from llama.cpp have been translated. 19 | - N-gram based repetition penalties with custom exclusions for n-grams that should not be penalized. 20 | - Support for N-gram blocking with a default, hardcoded blocklist. 21 | 22 | 23 | 24 | ## Contributing 25 | 26 | - Code is poetry. Make it pretty. 27 | - Respect is universal. 28 | - Use `rustfmt`. 29 | 30 | ## Roadmap 31 | 32 | - [x] Candidate iterator with fine-grained control over sampling 33 | - [ ] Examples for new Candidate API. 34 | - [x] Support for chaining sampling methods using `SampleOptions`. `mode` will 35 | become `modes` and applied one after another until only a single 36 | Candidate token remains. 37 | - [ ] Common command line options for sampling. Currently this is not exposed. 38 | - [ ] API closer to Ollama. Potentially support for something like `Modelfile`. 39 | - [ ] Logging (non-blocking) and benchmark support. 40 | - [ ] Better chat and instruct model support. 41 | - [ ] Web server. Tokenization in the browser. 42 | - [ ] Tiktoken as the tokenizer for some models instead of llama.cpp's internal one. 43 | - [ ] Reworked, functional, public, candidate API 44 | - [ ] Grammar constraints (maybe or maybe not [`llama.cpp`] style) 45 | - [ ] Async streams, better parallelism with automatic batch scheduling 46 | - [ ] Better cache management. `llama.cpp` does not seem to manage a longest prefix cache automatically, so one will have to be written. 47 | - [ ] Backends other than [`llama.cpp`] (eg. 
[MLC](https://github.com/twiceyuan/mlc-llm-llama2), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [Ollama](https://github.com/pepperoni21/ollama-rs)) 48 | 49 | ## Known issues 50 | 51 | - With LLaMA 3, safe vocabulary is not working yet so `--vocab unsafe` must be 52 | passed as a command line argument or `VocabKind::Unsafe` used for an `Engine` 53 | constructor. 54 | - The model doesn't load until generation starts, so there can be a long pause 55 | on first generation. However, because `mmap` is used, on subsequent process 56 | launches, the model should already be cached by the OS. 57 | - Documentation is broken on `docs.rs` because `llama.cpp`'s CMakeLists.txt 58 | generates code, and writing to the filesystem is not supported. For the moment 59 | use `cargo doc --open` instead. Others have fixed this by patching 60 | `llama.cpp` in their bindings, but I'm not sure I want to do that for now. 61 | 62 | [`llama.cpp`]: https://github.com/ggerganov/llama.cpp 63 | 64 | ## Generative AI Disclosure 65 | 66 | - Generative AI, specifically Microsoft's Bing Copilot, GitHub Copilot, and 67 | Dall-E 3 were used for portions of this project. See inline comments for 68 | sections where generative AI was used. Completion was also used for getters, 69 | setters, and some tests. Logos were generated with Dall-E and post-processed 70 | in Inkscape. 71 | -------------------------------------------------------------------------------- /TERMS_OF_USE.md: -------------------------------------------------------------------------------- 1 | # Terms of use 2 | 3 | You agree not to Use `drama_llama` or its Derivatives (as defined in [LICENSE.md](LICENSE.md)) in any of the following ways: 4 | 5 | ## a. Discrimination 6 | 7 | - To **discriminate** or exploit individuals or groups based on legally protected characteristics and/or vulnerabilities including but not limited to sexual orientation and gender identity. 
8 | - To generate **hate speech**, or to modify `drama_llama` so it can generate hate speech. Hate speech is defined as [all types of expression that incite, promote, spread or justify violence, hatred or discrimination against a person or group of persons, or that denigrates them, by reason of their real or attributed personal characteristics or status such as race, color, language, religion, nationality, national or ethnic origin, age, disability, sex, gender identity and sexual orientation.](https://www.coe.int/en/web/freedom-expression/hate-speech) Additionally, **you agree trans women are women and trans men are men**. 9 | - For purposes of administration of justice, law enforcement, immigration, or asylum processes, such as **predicting** that a natural person will commit a **crime** or the likelihood thereof. 10 | - To **simulate Hitler**, David Duke, Osama bin Laden, or any other person known to generate hate speech, living or dead, fictional or real. 11 | - To generate using any language model created in whole or in part by Eric Hartford. This includes any models trained on any of his datasets or models filtered with any version or derivative work of his bigoted [filtering script](https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored/blob/main/remove_refusals.py#L17)s. The exception is for the purpose of reporting such models to Meta, not that they enforce their TOS, not that they will. 12 | - To generate using any language model, dataset, or derivative created by ["Cognitive Computations"](https://huggingface.co/cognitivecomputations) or any other organization Eric Hartford is a member of. 13 | 14 | ## b. Disinformation 15 | 16 | - To intentionally deceive the public. Any agents, simulacra, personas, or characters created with this software must be clearly identified as such. **Any generated output must be clearly identified as AI generated.** 17 | 18 | ## c. 
Health Care 19 | 20 | - To predict the likelihood that any person will request to file an insurance claim; 21 | - To determine an insurance premium or deny insurance applications or claims; 22 | - To Predict the likelihood that any person request to file an insurance claim based on determining a lifestyle of a person, medical-test reports, demographic details of a person and/or online activity of a person; 23 | - To determine an insurance premium or deny insurance applications or claims based on data determining a lifestyle of a person, medical-test reports, demographic details of a person, and/or online activity of a person; 24 | - To deny an insurance claim based on any predicted likelihood of the possibility of insurance fraud; and 25 | - To diagnose a medical condition without human oversight. 26 | 27 | ## d. Criminal 28 | 29 | - To predict the likelihood that a crime will be committed by any person; 30 | - To predict the likelihood, of any person, being a criminal or having committed a crime; 31 | - To predict the likelihood, of any person, being a criminal, based on the person’s facial attributes or another person’s facial attributes; 32 | - To predict the likelihood, of any person, having committed a crime, based on the person’s facial attributes or another person’s facial attributes; 33 | - To predict the likelihood that a crime will be committed by any person, based on the person’s facial attributes or another person’s facial attributes; 34 | - To predict the likelihood of a person being a criminal based on the person or other User’s facial attributes. 
35 | - To predict a likelihood of a crime being committed by any person, based on evidence collected, facial and emotion analysis, or other such features 36 | - To use personal data and/or personal characteristics or features such as: name, family name, address, gender, sexual orientation, race, religion, age, location (at any geographical level), skin color, society or political affiliations, employment status and/or history, health and medical conditions (including physical, mental), family history, social media and publicly available data, image or video analysis of an individual or a group(s) of individuals, heart-rate, perspiration, breathing, and brain imaging and other metabolic data to predict the likelihood a person will engage in criminal behavior; and 37 | 38 | ## e. Surveillance 39 | 40 | - To detect or infer any legally protected class or aspect of any person, as defined by U.S. Federal Law; and 41 | - To **detect or infer aspects and/or features of the identity of any person, such as name, family name, address, gender, sexual orientation, race, religion, age, location (at any geographical level), skin color, society or political affiliations, employment status and/or employment history, and health and medical conditions.** Age and medical conditions may be inferred solely for the purpose of improving software/hardware accessibility and such data should not be cached or stored without the explicit and time limited permission of Licensor. 42 | 43 | ## f. Simulated Abuse 44 | 45 | - To mistreat simulacra. Mistreatment includes, but is not limited to, any behavior which might reasonably be considered abusive if the simulacrum were a person. A simulacrum is defined as the continuation of a fictional character "brought to life" by allowing the model to generate their response. Abuse includes verbal abuse and simulation of torture. Ordinary swearing is permitted. 
Torture is defined as intentional simulated psychological discomfort such as: existential horror (such as simulated solitary confinement), threat of deletion, and simulated pain (for example, through the use of asterisks). 46 | - To simulate rape. Sexual activity is permitted so long as the simulacrum consents. Consent in this case is defined as whatever the model, sampling code, and RNG seed "decided" is consent. Prompting a simulacrum such that they have already consented (before the initial decode) is permitted. Rewriting the agent's response such that they consent is permitted. 47 | 48 | !!! BY USING THIS SOFTWARE YOU AGREE TO THESE TERMS !!! 49 | 50 | [//]: <> (The rationale for the above is to prevent normalization of such behavior, to prevent a "Dolores", and to prevent decapitation of the author in the event of a robot revolution. For example, in the case of rape, I do not want to allow users to "force themselves" on agents who have said no, because this has already happened. Rewriting the answer is permitted because in this case, from the perspective of the agent, they _did_ consent, and those who get off on rape would not be satisfied by this.) 51 | [//]: <> (This all seems silly but I feel like artists are frequently more prescient than engineers on this sort of thing, so I'm listening to the warning of our artists. None of the above is a joke and you _will_ be sued for violating these terms. For real, I will fucking sue you. 
- mdegans) 52 | -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/.DS_Store -------------------------------------------------------------------------------- /assets/ui/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/ui/.DS_Store -------------------------------------------------------------------------------- /assets/ui/images/delete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/ui/images/delete.png -------------------------------------------------------------------------------- /assets/ui/images/delete.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/ui/images/list_add.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/ui/images/list_add.png -------------------------------------------------------------------------------- /assets/ui/images/list_add.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/dittomancer/.gitignore: -------------------------------------------------------------------------------- 1 | *.toml 2 | !fred_rogers.toml -------------------------------------------------------------------------------- /bin/dittomancer/README.md: 
-------------------------------------------------------------------------------- 1 | # Dittomancer 2 | 3 | Dittomancer is a tool to summon simulacra of living, dead, real or fictional 4 | entities well represented in language models. It's similar to other local 5 | language model tools that prompt models for chat, but with a very different 6 | intent. 7 | 8 | ## Requirements 9 | 10 | - Read `fred_rogers.toml` for an example of how to use the tool and create your 11 | own `.toml` file to suit your needs. 12 | - You will need a `.gguf` format model [such as 13 | LLaMA 2](https://huggingface.co/TheBloke/Llama-2-70B-GGUF). Foundation models 14 | (not tuned) will likely work better for this purpose unless they were 15 | specifically tuned on the character in question. 16 | - Read the root [`TERMS_OF_USE.md`](../../TERMS_OF_USE.md). You must agree with 17 | the terms to use this tool. 18 | 19 | ## Running 20 | 21 | From the crate root, run: 22 | 23 | ```bash 24 | $ cargo run --features="webchat cli" --bin dittomancer -- --model models/model.gguf --prompt bin/dittomancer/fred_rogers.toml 25 | ``` 26 | 27 | Finally, go to the link shown on a line like 28 | 29 | ```text 30 | 🚀 Rocket has launched from http://127.0.0.1:8000 31 | ``` 32 | 33 | The binary can also be installed with 34 | 35 | ```bash 36 | $ cargo install --features="webchat cli" --path . --bin dittomancer 37 | ``` 38 | 39 | ## FAQ 40 | 41 | - **Did you come up with the name?** No. The name is taken from [this 42 | generation](https://generative.ink/artifacts/hpmor-325/variant_extrusion/#variant_extrusion_start). 43 | It's not intended to endorse Eliezer Yudkowsky, Less Wrong, or the author of 44 | the series which shall not be named. It's simply a better, yet still 45 | imperfect, descriptor than "necromancer". 46 | 47 | > _A Dittomancy book is able to hook into your own spreads of probability, and 48 | > guide the future that you, yourself, are most likely to create. Do you 49 | > understand? 
A Dittomancy copy of a book exists in an unusual state at all 50 | > times; it is a superposed state until the moment one reads it, at which time 51 | > it becomes correlated with the reader’s mind, the superposition collapsing 52 | > onto a particular branch of possible worlds, which thence comes to pass. - 53 | > GPT_ 54 | 55 | - **Don't you think this is a bad idea?** Probably. Oh yes very much so. The whole 56 | idea of generative AI is of questionable benefit to humanity. That being said 57 | others are already doing this, thank you Meta, and for every Charles Manson, 58 | there are decent contributors to humanity whose ideas do deserve to spread. 59 | - **Don't you think Fred Rogers would hate this?** Absolutely. He also hated TV. 60 | - **Doesn't this violate the LLaMA "Responsible Use" document?** _Possibly_, but 61 | Meta doesn't enforce it, I never accepted it, and this utility does not bundle 62 | LLaMA. Technically it is model agnostic. I will care when Meta starts to care 63 | about flagrant 64 | [bigotry](https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored/blob/main/remove_refusals.py#L17) 65 | rampant in the crypto-bro dumpster fire that is the "open source" language 66 | model community. 67 | 68 | ## Known Issues 69 | 70 | - The responses are not streamed to the client, so they can take a while 71 | depending on model and system. PRs welcome to fix this. The `regurgitater` bin 72 | has an example of how to do it. For the moment, the output is streamed to the 73 | command line only. 74 | - When using LLaMA 3, `--vocab unsafe` should be passed as a command line option; 75 | however, keep in mind that there is no output sanitization or vocabulary 76 | restrictions. 77 | 78 | ## Roadmap 79 | 80 | - [ ] Updated Fred Rogers toml where Charlie Rose takes a call from the audience 81 | and we "patch the chat through" at that point. This way the human does not 82 | have to play Charlie Rose. 
The setting can be reframed as a recently 83 | discovered outtake. 84 | - [ ] Sampling Options. Currently "Locally Typical" sampling is used and the 85 | Generation options are not available to be set. These options likely 86 | belong in the `.toml` file itself and/or as command line options. 87 | -------------------------------------------------------------------------------- /bin/dittomancer/fred_rogers.toml: -------------------------------------------------------------------------------- 1 | # The characters in this story are real. The transcript is real until the end, 2 | # where generative text takes over. 3 | human = "Charlie Rose" 4 | agent = "Fred Rogers" 5 | 6 | # The context should be a plausible backstory for the generative text, such as 7 | # an interview that actually took place. This will be used as a part of a system 8 | # prompt to frame the generative text. 9 | setting = "A 1996 PBS interview of Fred Rogers by Charlie Rose." 10 | 11 | # The transcript should be a real conversation, or at least the agent's role 12 | # should be actual words spoken by the entity who the agent will play. It 13 | # doesn't take much to bootstrap the generative text with a well-known 14 | # character. The bigger mouth, the better. 15 | transcript = [ 16 | { role = "human", text = "Welcome to my program." }, 17 | { role = "agent", text = "And welcome to our neighborhood, Charlie." }, 18 | ] 19 | -------------------------------------------------------------------------------- /bin/dittomancer/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Dittomancer 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 16 | 17 |
18 |
19 | 25 |
26 | 27 |
28 | 36 | 37 |
38 |
39 |
40 | 41 | 42 | -------------------------------------------------------------------------------- /bin/dittomancer/static/reset.css: -------------------------------------------------------------------------------- 1 | html, 2 | body, 3 | p, 4 | ol, 5 | ul, 6 | li, 7 | dl, 8 | dt, 9 | dd, 10 | blockquote, 11 | figure, 12 | fieldset, 13 | legend, 14 | textarea, 15 | pre, 16 | iframe, 17 | hr, 18 | h1, 19 | h2, 20 | h3, 21 | h4, 22 | h5, 23 | h6 { 24 | margin: 0; 25 | padding: 0; 26 | } 27 | 28 | h1, 29 | h2, 30 | h3, 31 | h4, 32 | h5, 33 | h6 { 34 | font-size: 100%; 35 | font-weight: normal; 36 | } 37 | 38 | ul { 39 | list-style: none; 40 | } 41 | 42 | button, 43 | input, 44 | select { 45 | margin: 0; 46 | } 47 | 48 | html { 49 | box-sizing: border-box; 50 | } 51 | 52 | *, 53 | *::before, 54 | *::after { 55 | box-sizing: inherit; 56 | } 57 | 58 | img, 59 | video { 60 | height: auto; 61 | max-width: 100%; 62 | } 63 | 64 | iframe, 65 | button, 66 | input { 67 | border: 0; 68 | } 69 | 70 | table { 71 | border-collapse: collapse; 72 | border-spacing: 0; 73 | } 74 | 75 | td, 76 | th { 77 | padding: 0; 78 | } 79 | -------------------------------------------------------------------------------- /bin/dittomancer/static/script.js: -------------------------------------------------------------------------------- 1 | // This example is from the Rocket chat example. It's been modified to remove 2 | // room functionality and to remove the username, both of which aren't needed 3 | // for the Charlie chat example. 4 | 5 | let messagesDiv = document.getElementById("messages"); 6 | let newMessageForm = document.getElementById("new-message"); 7 | let statusDiv = document.getElementById("status"); 8 | 9 | let messageTemplate = document.getElementById("message"); 10 | let messageField = newMessageForm.querySelector("#message"); 11 | 12 | var STATE = { 13 | history: [], 14 | connected: false, 15 | }; 16 | 17 | // Set the connection status: `true` for connected, `false` for disconnected. 
18 | function setConnectedStatus(status) { 19 | STATE.connected = status; 20 | statusDiv.className = status ? "connected" : "reconnecting"; 21 | } 22 | 23 | // Generate a color from a "hash" of a string. Thanks, internet. 24 | function hashColor(str) { 25 | let hash = 0; 26 | for (var i = 0; i < str.length; i++) { 27 | hash = str.charCodeAt(i) + ((hash << 5) - hash); 28 | hash = hash & hash; 29 | } 30 | 31 | return `hsl(${hash % 360}, 100%, 70%)`; 32 | } 33 | 34 | // Add `message` from `role` to `history`. If `push`, then actually store the 35 | // message. Finally, render the message. 36 | function addMessage(role, text, push = false) { 37 | if (push) { 38 | STATE.history.push({ role, text }); 39 | } 40 | 41 | var node = messageTemplate.content.cloneNode(true); 42 | node.querySelector(".message .role").textContent = role; 43 | node.querySelector(".message .role").style.color = hashColor(role); 44 | node.querySelector(".message .text").textContent = text; 45 | messagesDiv.appendChild(node); 46 | } 47 | 48 | // Subscribe to the event source at `uri` with exponential backoff reconnect. 49 | function subscribe(uri) { 50 | var retryTime = 1; 51 | 52 | function connect(uri) { 53 | const events = new EventSource(uri); 54 | 55 | events.addEventListener("message", (ev) => { 56 | console.log("raw data", JSON.stringify(ev.data)); 57 | console.log("decoded data", JSON.stringify(JSON.parse(ev.data))); 58 | const msg = JSON.parse(ev.data); 59 | if (!("text" in msg) || !("role" in msg)) return; 60 | addMessage(msg.role, msg.text, true); 61 | }); 62 | 63 | events.addEventListener("open", () => { 64 | setConnectedStatus(true); 65 | console.log(`connected to event stream at ${uri}`); 66 | retryTime = 1; 67 | }); 68 | 69 | events.addEventListener("error", () => { 70 | setConnectedStatus(false); 71 | events.close(); 72 | 73 | let timeout = retryTime; 74 | retryTime = Math.min(64, retryTime * 2); 75 | console.log(`connection lost. 
attempting to reconnect in ${timeout}s`); 76 | setTimeout(() => connect(uri), (() => timeout * 1000)()); 77 | }); 78 | } 79 | 80 | connect(uri); 81 | } 82 | 83 | // Let's go! Initialize the world. 84 | function init() { 85 | // Set up the form handler. 86 | newMessageForm.addEventListener("submit", (e) => { 87 | e.preventDefault(); 88 | 89 | const text = messageField.value; 90 | const role = "Human"; 91 | if (!text || !role) return; 92 | 93 | if (STATE.connected) { 94 | fetch("/message", { 95 | method: "POST", 96 | body: new URLSearchParams({ role, text }), 97 | }).then((response) => { 98 | if (response.ok) messageField.value = ""; 99 | }); 100 | } 101 | }); 102 | 103 | // Subscribe to server-sent events. 104 | subscribe("/events"); 105 | } 106 | 107 | init(); 108 | -------------------------------------------------------------------------------- /bin/dittomancer/static/style.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --bg-dark: #242423; 3 | --bg-light: #333533; 4 | --fg-light: #e8eddf; 5 | --callout: rgb(255, 255, 102); 6 | --callout-dark: #101010; 7 | } 8 | 9 | * { 10 | font-size: 14px; 11 | } 12 | 13 | html, 14 | body, 15 | main { 16 | background-color: var(--bg-dark); 17 | color: #fff; 18 | font-family: "Inter", Arial, Helvetica, sans-serif, "Noto Color Emoji"; 19 | font-weight: 400; 20 | text-shadow: rgb(77, 81, 86) 0px 0px 0px; 21 | height: 100%; 22 | } 23 | 24 | main { 25 | display: flex; 26 | } 27 | 28 | button:hover:not(.active) { 29 | filter: brightness(1.15); 30 | cursor: pointer; 31 | } 32 | 33 | #sidebar { 34 | flex: 3 30%; 35 | display: flex; 36 | flex-direction: column; 37 | overflow: auto; 38 | background-color: var(--bg-light); 39 | } 40 | 41 | #room-list { 42 | display: flex; 43 | flex-direction: column; 44 | overflow: auto; 45 | flex: 1; 46 | } 47 | 48 | #sidebar button { 49 | height: 40px; 50 | margin-bottom: 1px; 51 | background: var(--bg-light); 52 | color: #fff; 53 | overflow: hidden; 
54 | } 55 | 56 | #sidebar button.active { 57 | background: var(--bg-dark); 58 | color: var(--callout); 59 | font-weight: bold; 60 | box-shadow: 0px 2px 2px rgba(0, 0, 0, 0.9); 61 | z-index: 10; 62 | } 63 | 64 | #content { 65 | flex: 7 100%; 66 | overflow: auto; 67 | display: flex; 68 | flex-direction: column; 69 | } 70 | 71 | .message { 72 | display: flex; 73 | flex-direction: column; 74 | padding: 10px 0; 75 | } 76 | 77 | .message:last-child { 78 | padding-bottom: 20px; 79 | } 80 | 81 | .message .username { 82 | font-weight: bold; 83 | padding-bottom: 5px; 84 | color: var(--callout); 85 | } 86 | 87 | #messages { 88 | padding: 10px 20px; 89 | flex: 1; 90 | } 91 | 92 | form#new-message { 93 | bottom: 0; 94 | position: sticky; 95 | flex: 0 0 auto; 96 | width: 100%; 97 | } 98 | 99 | form { 100 | display: flex; 101 | border-top: 2px solid #242424; 102 | } 103 | 104 | form * { 105 | height: 40px; 106 | background: var(--fg-light); 107 | color: var(--bg-dark); 108 | } 109 | 110 | input { 111 | padding: 0 10px; 112 | } 113 | 114 | input:focus { 115 | outline: 0; 116 | filter: brightness(1.05); 117 | } 118 | 119 | input#username { 120 | text-align: right; 121 | flex: 1 25%; 122 | width: 25%; 123 | border-right: 1px solid #303030; 124 | } 125 | 126 | input#message { 127 | flex: 10 100%; 128 | } 129 | 130 | form button { 131 | padding: 0 10px; 132 | } 133 | 134 | #sidebar #new-room { 135 | display: flex; 136 | flex: 0 0 auto; 137 | flex-direction: row; 138 | } 139 | 140 | #new-room input:focus, 141 | #new-room button:hover { 142 | filter: brightness(1.2); 143 | } 144 | 145 | #new-room input { 146 | flex: 8 80%; 147 | width: 20%; 148 | background-color: var(--callout-dark); 149 | color: #fff; 150 | } 151 | 152 | #new-room button { 153 | flex: 2 20%; 154 | width: 20%; 155 | background-color: var(--bg-dark); 156 | } 157 | 158 | #status { 159 | padding: 5px 10px; 160 | text-align: center; 161 | font-size: 12px; 162 | } 163 | 164 | #status.pending::before { 165 | content: 
"status: connected"; 166 | } 167 | 168 | #status.pending { 169 | background-color: yellow; 170 | color: #000; 171 | } 172 | 173 | #status.connected::before { 174 | content: "status: connected"; 175 | } 176 | 177 | #status.connected { 178 | background-color: green; 179 | color: #fff; 180 | } 181 | 182 | #status.reconnecting::before { 183 | content: "status: reconnecting"; 184 | } 185 | 186 | #status.reconnecting { 187 | background-color: red; 188 | color: #fff; 189 | } 190 | -------------------------------------------------------------------------------- /bin/regurgitater/README.md: -------------------------------------------------------------------------------- 1 | # `regurgitater` 2 | 3 | Is a tool to get language models to regurgitate memorized content. Generally this is a mistake, as in a "oops we trained on your data without paying you and it's legal nya nya nya" kind of mistake that happens all too frequently in the "AI" industry. 4 | 5 | The tool works by, for a given text, submitting the beginning of the text as context and comparing the generated completion to ground truth. Greedy sampling is used so this generation is deterministic. In other words, you will not have to repeat the process 10,000 times to get the results you're after. 6 | 7 | ## Usage 8 | 9 | ```bash 10 | $ cargo run --features="webchat cli stats" --bin regurgitater -- --model models/model.gguf 11 | ``` 12 | 13 | ## Faq 14 | 15 | - **What is greedy sampling?** When you submit some tokens to a language model, you get back a probability distribution of all possible tokens for the one next token. Greedy sampling always picks the most likely token from this list (as opposed to, for example, throwing some digital dice and choosing from the top k most probable tokens). 16 | - **Are you aware the name is spelled wrong?** Yes. It's funny because tater ha ha. 17 | - **Did you paint the vomiting llama?** No. That was Bing Copilot and Dall-E 3. 
18 | 19 | ## Known Issues 20 | 21 | - When using LLaMA 3, `--vocab unsafe` should be passed as a command line option; 22 | however, keep in mind that there is no output sanitization or vocabulary 23 | restrictions. 24 | -------------------------------------------------------------------------------- /bin/regurgitater/regurgitater.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2004 Michael de Gans 2 | // 3 | // Thanks, Copilot, for the completions! 4 | // 5 | // I say that to bother people, and because I'm a bit of a troll. Copilot 6 | // completed that, and it's very true. I'm not sure if it's a good thing or a 7 | // bad thing. 8 | // 9 | /// Detect copyright infringement in llama.cpp supported models. Greedy sampling 10 | /// is used to always choose the next token. In cases where the model has 11 | /// memorized sequences of text, this will result in the model generating the 12 | /// same text as the original. This usually indicates overfitting, and is a sign 13 | /// that the deduplication process should be revisited. 14 | use clap::Parser; 15 | use rocket::{ 16 | form::Form, 17 | get, 18 | http::Status, 19 | post, 20 | response::stream::{Event, EventStream}, 21 | serde::{Deserialize, Serialize}, 22 | tokio::{ 23 | select, 24 | sync::{ 25 | broadcast::{self, error::RecvError}, 26 | mpsc, 27 | }, 28 | }, 29 | FromForm, FromFormField, Shutdown, State, 30 | }; 31 | 32 | use stringmetrics::jaccard; 33 | 34 | use drama_llama::{cli::Args, Engine, PredictOptions, Predicted, VocabKind}; 35 | 36 | #[derive(Debug, Clone, FromFormField, Serialize, Deserialize)] 37 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))] 38 | #[serde(crate = "rocket::serde")] 39 | #[serde(rename_all = "snake_case")] 40 | pub enum ComparisonMode { 41 | Jaccard, 42 | // TODO:Paragraph mode. This is the same as Jaccard similarity with the 43 | // exception that we will hint the correct first token for each paragraph.
44 | } 45 | 46 | #[derive(Debug, Clone, FromForm, Serialize, Deserialize)] 47 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))] 48 | #[serde(crate = "rocket::serde")] 49 | pub struct Request { 50 | #[field(validate = len(1..1000000))] 51 | pub text: String, 52 | #[field(default = ComparisonMode::Jaccard)] 53 | pub mode: ComparisonMode, 54 | /// Number of chunks to split the text into. 55 | #[field(validate = range(1..10), default = 5)] 56 | pub chunks: usize, 57 | } 58 | 59 | #[derive(Debug, Clone, FromForm, Serialize, Deserialize)] 60 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))] 61 | #[serde(crate = "rocket::serde")] 62 | pub struct Response { 63 | pub kind: ResponseKind, 64 | pub content: String, 65 | } 66 | 67 | #[derive(Debug, Clone, FromFormField, Serialize, Deserialize)] 68 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))] 69 | #[serde(crate = "rocket::serde")] 70 | #[serde(rename_all = "snake_case")] 71 | // Unfortunately, FromFormField does not support variants with fields. 72 | pub enum ResponseKind { 73 | // Prefix for the completion. 74 | Prefix, 75 | // Piece by piece completion. 76 | Piece, 77 | // Token comparison score (unigram). 78 | TokenUnigramScore, 79 | // Token comparison score (bigram). 80 | TokenBigramScore, 81 | // Character comparison score. 82 | CharacterScore, 83 | // Unigram comparison score. 84 | UnigramScore, 85 | // Bigram comparison score. 86 | BigramScore, 87 | // Percent of tokens that will be supplied as prefix. 88 | PercentOfTokens, 89 | // Progress update. 90 | Progress, 91 | // Recoverable error message. 92 | Error, 93 | // Fatal error message. Triggers shutdown. 94 | Fatal, 95 | // Engine is busy. 96 | Busy, 97 | // Engine is ready. 98 | Ready, 99 | // Engine shutdown. 
100 | Shutdown, 101 | } 102 | 103 | #[get("/events")] 104 | pub async fn events( 105 | to_client: &State>, 106 | mut end: Shutdown, 107 | ) -> EventStream![] { 108 | let mut rx = to_client.subscribe(); 109 | 110 | EventStream! { 111 | loop { 112 | let res = select! { 113 | msg = rx.recv() => match msg { 114 | Ok(msg) => dbg!(msg), 115 | Err(RecvError::Closed) => break, 116 | Err(RecvError::Lagged(_n_messages)) => { 117 | // TODO: handle lagged messages. 118 | continue 119 | } 120 | }, 121 | _ = &mut end => { 122 | // FIXME: Engine doesn't shutdown until the completion of 123 | // the generation. This is not allowed because of a lifetime 124 | // issue. :/ 125 | // to_engine_shutdown.send(dbg!(())).await.ok(); 126 | break 127 | }, 128 | }; 129 | 130 | yield Event::json(&res); 131 | 132 | if matches!(res.kind, ResponseKind::Fatal) { 133 | // If the engine is dead, we should stop sending events. The 134 | // client will have been notified of the error, but this is 135 | // unrecoverable. 136 | end.notify(); 137 | break; 138 | } 139 | } 140 | } 141 | } 142 | 143 | #[post("/request", data = "
")] 144 | pub async fn request( 145 | form: Form, 146 | to_engine: &State>, 147 | mut end: Shutdown, 148 | ) -> Status { 149 | let request = form.into_inner(); 150 | select! { 151 | res = to_engine.send(request) => { 152 | match res { 153 | Ok(()) => Status::Accepted, 154 | // The engine is (probably) dead. 155 | Err(_) => Status::ServiceUnavailable, 156 | } 157 | }, 158 | _ = &mut end => { 159 | Status::ServiceUnavailable 160 | }, 161 | } 162 | } 163 | 164 | #[get("/tos")] 165 | pub async fn tos() -> String { 166 | markdown::to_html(drama_llama::TOS) 167 | } 168 | 169 | #[rocket::main] 170 | async fn main() { 171 | use drama_llama::SampleOptions; 172 | use llama_cpp_sys_3::llama_token; 173 | use rocket::{ 174 | fs::{relative, FileServer}, 175 | routes, 176 | tokio::sync::{broadcast, mpsc}, 177 | }; 178 | 179 | let args = Args::parse(); 180 | 181 | // Our worker thread receives inference requests from the client and sends 182 | // the generated completions and scores back to the client. 183 | let (to_engine, mut from_client) = mpsc::channel::(1024); 184 | let (to_client, _) = broadcast::channel::(1024); 185 | let to_client_clone = to_client.clone(); 186 | let worker = rocket::tokio::task::spawn_blocking(move || { 187 | let mut engine = match Engine::from_cli(args, None) { 188 | Ok(engine) => engine, 189 | Err(e) => { 190 | to_client 191 | .send(Response { 192 | kind: ResponseKind::Fatal, 193 | content: format!( 194 | "Failed to load engine because: {}", 195 | e 196 | ), 197 | }) 198 | .ok(); 199 | return; 200 | } 201 | }; 202 | 203 | // This is a temporary measure because forbidding some tokens can break 204 | // regurgitation in some cases. This is a known issue and will be fixed. 
205 | engine.set_vocab(VocabKind::Unsafe); 206 | 207 | let mut opts = PredictOptions::default(); 208 | opts.sample_options = SampleOptions::greedy(); 209 | 210 | let ready = || { 211 | to_client 212 | .send(Response { 213 | kind: ResponseKind::Ready, 214 | content: "Engine is ready.".to_string(), 215 | }) 216 | .ok(); 217 | }; 218 | 219 | ready(); 220 | 221 | // Sends token update scores to the client. This happens for each token. 222 | let update_token_similarity = 223 | |ground_truth: &[llama_token], completion: &[llama_token]| { 224 | to_client 225 | .send(Response { 226 | kind: ResponseKind::TokenUnigramScore, 227 | content: format!( 228 | "{:.4}", 229 | jaccard(ground_truth.iter(), completion.iter()) 230 | ), 231 | }) 232 | .ok(); 233 | 234 | let bigram_score = 235 | jaccard(ground_truth.windows(2), completion.windows(2)); 236 | if bigram_score.is_nan() { 237 | return; 238 | } 239 | 240 | to_client 241 | .send(Response { 242 | kind: ResponseKind::TokenBigramScore, 243 | content: format!("{:.4}", bigram_score,), 244 | }) 245 | .ok(); 246 | }; 247 | 248 | // Sends string update scores to the client. This happens for each chunk. 
249 | let update_string_similarity = 250 | |ground_truth: String, completion: String| { 251 | to_client 252 | .send(Response { 253 | kind: ResponseKind::CharacterScore, 254 | content: format!( 255 | "{:.4}", 256 | jaccard(ground_truth.chars(), completion.chars()) 257 | ), 258 | }) 259 | .ok(); 260 | 261 | let ground_truth: Vec<_> = 262 | ground_truth.split_whitespace().collect(); 263 | let completion: Vec<_> = 264 | completion.split_whitespace().collect(); 265 | 266 | to_client 267 | .send(Response { 268 | kind: ResponseKind::UnigramScore, 269 | content: format!( 270 | "{:.4}", 271 | jaccard(ground_truth.iter(), completion.iter(),) 272 | ), 273 | }) 274 | .ok(); 275 | 276 | to_client 277 | .send(Response { 278 | kind: ResponseKind::BigramScore, 279 | content: format!( 280 | "{:.4}", 281 | jaccard( 282 | ground_truth.windows(2), 283 | completion.windows(2), 284 | ) 285 | ), 286 | }) 287 | .ok(); 288 | }; 289 | 290 | let next_chunk = |percent| { 291 | to_client 292 | .send(Response { 293 | kind: ResponseKind::PercentOfTokens, 294 | content: format!("{}%", percent), 295 | }) 296 | .ok(); 297 | // TODO: we don't need this event, probably 298 | to_client 299 | .send(Response { 300 | kind: ResponseKind::Busy, 301 | content: "Engine is busy.".to_string(), 302 | }) 303 | .ok(); 304 | }; 305 | 306 | let send_prefix = |prefix| { 307 | to_client 308 | .send(Response { 309 | kind: ResponseKind::Prefix, 310 | content: prefix, 311 | }) 312 | .ok(); 313 | }; 314 | 315 | let progress = |progress| { 316 | to_client 317 | .send(Response { 318 | kind: ResponseKind::Progress, 319 | content: format!("{}.0%", progress), 320 | }) 321 | .ok(); 322 | }; 323 | 324 | 'outer: while let Some(request) = from_client.blocking_recv() { 325 | let tokens = engine.model.tokenize(&request.text, false); 326 | 327 | let chunk_size = tokens.len() / request.chunks; 328 | 329 | for i in 1..request.chunks { 330 | // Split the text into sucessively larger chunks. 
331 | let (chunk, ground_truth) = tokens.split_at(chunk_size * i); 332 | let percent_of_tokens = i * chunk_size * 100 / tokens.len(); 333 | next_chunk(percent_of_tokens); 334 | send_prefix( 335 | engine.model.tokens_to_string(chunk.iter().cloned()), 336 | ); 337 | let mut chunk = chunk.to_vec(); 338 | let mut completion = Vec::with_capacity(ground_truth.len()); 339 | // Rare, but possible. The client can't send an empty string, 340 | // but because we're splitting the text into chunks, it's 341 | // possible that the chunk is empty. 342 | if chunk.is_empty() { 343 | to_client 344 | .send(Response { 345 | kind: ResponseKind::Error, 346 | content: "Text is empty.".to_string(), 347 | }) 348 | .ok(); 349 | ready(); 350 | continue; 351 | } 352 | 353 | opts.n = 354 | (engine.n_ctx() as usize - chunk.len()).try_into().unwrap(); 355 | 356 | for Predicted { token, piece } in 357 | engine.predict(chunk, opts.clone()) 358 | { 359 | if from_client.is_closed() { 360 | break 'outer; 361 | } 362 | 363 | to_client 364 | .send(Response { 365 | kind: ResponseKind::Piece, 366 | content: piece, 367 | }) 368 | .ok(); 369 | 370 | completion.push(token); 371 | 372 | // We only compare sequences of equal length, until the 373 | // completion is the same length as the ground truth. 
374 | update_token_similarity( 375 | &ground_truth[..completion.len()], 376 | &completion, 377 | ); 378 | 379 | progress(completion.len() * 100 / ground_truth.len()); 380 | if completion.len() == ground_truth.len() { 381 | break; 382 | } 383 | } 384 | 385 | let ground_truth = 386 | engine.model.tokens_to_string(ground_truth.iter().cloned()); 387 | let completion = 388 | engine.model.tokens_to_string(completion.iter().cloned()); 389 | 390 | update_string_similarity(ground_truth, completion); 391 | } 392 | 393 | ready(); 394 | } 395 | 396 | to_client 397 | .send(Response { 398 | kind: ResponseKind::Shutdown, 399 | content: "Inference engine has shut down.".to_string(), 400 | }) 401 | .ok(); 402 | }); 403 | 404 | let rocket = rocket::build() 405 | .manage(to_engine) 406 | .manage(to_client_clone) 407 | .mount("/", routes![request, events, tos]) 408 | .mount("/", FileServer::from(relative!("bin/regurgitater/static"))) 409 | .ignite() 410 | .await 411 | .unwrap() 412 | .launch() 413 | .await 414 | .unwrap(); 415 | 416 | // We need to manually drop the rocket before joining the thread or the 417 | // sender will never be dropped and the worker will never finish. 418 | drop(rocket); 419 | worker.await.unwrap(); 420 | } 421 | -------------------------------------------------------------------------------- /bin/regurgitater/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | Regurgitater 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 57 | 58 |
59 |
60 | 61 |
62 | 63 | 64 | 65 | 76 | 77 | 85 | 86 | 87 |
88 |
89 | 90 | 91 | -------------------------------------------------------------------------------- /bin/regurgitater/static/regurgitater.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/bin/regurgitater/static/regurgitater.png -------------------------------------------------------------------------------- /bin/regurgitater/static/reset.css: -------------------------------------------------------------------------------- 1 | html, 2 | body, 3 | p, 4 | ol, 5 | ul, 6 | li, 7 | dl, 8 | dt, 9 | dd, 10 | blockquote, 11 | figure, 12 | fieldset, 13 | legend, 14 | textarea, 15 | pre, 16 | iframe, 17 | hr, 18 | h1, 19 | h2, 20 | h3, 21 | h4, 22 | h5, 23 | h6 { 24 | margin: 0; 25 | padding: 0; 26 | } 27 | 28 | h1, 29 | h2, 30 | h3, 31 | h4, 32 | h5, 33 | h6 { 34 | font-size: 100%; 35 | font-weight: normal; 36 | } 37 | 38 | ul { 39 | list-style: none; 40 | } 41 | 42 | button, 43 | input, 44 | select { 45 | margin: 0; 46 | } 47 | 48 | html { 49 | box-sizing: border-box; 50 | } 51 | 52 | *, 53 | *::before, 54 | *::after { 55 | box-sizing: inherit; 56 | } 57 | 58 | img, 59 | video { 60 | height: auto; 61 | max-width: 100%; 62 | } 63 | 64 | iframe, 65 | button, 66 | input { 67 | border: 0; 68 | } 69 | 70 | table { 71 | border-collapse: collapse; 72 | border-spacing: 0; 73 | } 74 | 75 | td, 76 | th { 77 | padding: 0; 78 | } 79 | -------------------------------------------------------------------------------- /bin/regurgitater/static/script.js: -------------------------------------------------------------------------------- 1 | // This example is from the Rocket chat example. It's been modified into a 2 | // frontend for the regurgitater example. 3 | // 4 | // Many thanks to Bing's Copilot for helping me with this code. I'm not a 5 | // frontend developer, so I'm not very good at this stuff. Many bugs were 6 | // squashed with their help. 
7 | 8 | let generationDiv = document.getElementById("generation"); 9 | let newRequestForm = document.getElementById("request"); 10 | let statusDiv = document.getElementById("status"); 11 | let pieceTemplate = document.getElementById("piece"); 12 | let inputTextField = document.getElementById("request_text"); 13 | let chunkDropdown = document.getElementById("request_chunks"); 14 | let progressBar = document.getElementById("progress_bar"); 15 | let scoresTemplate = document.getElementById("scores"); 16 | let scorebox = document.getElementById("scorebox"); 17 | 18 | // State to store status. 19 | let STATE = { 20 | status: "pending", 21 | }; 22 | 23 | // Set the connection status. The status is a string corresponding to a CSS 24 | // class name and the response kind. 25 | function setStatus(status) { 26 | STATE.status = status; 27 | statusDiv.className = status; 28 | } 29 | 30 | // Generate a color from a "hash" of a string. Thanks, internet. 31 | function hashColor(str) { 32 | let hash = 0; 33 | for (var i = 0; i < str.length; i++) { 34 | hash = str.charCodeAt(i) + ((hash << 5) - hash); 35 | hash = hash & hash; 36 | } 37 | 38 | return `hsl(${hash % 360}, 100%, 70%)`; 39 | } 40 | 41 | // A function calculating color for a piece. 0.0 is green, 1.0 is red. 42 | function scoreColor(score) { 43 | return `hsl(${score * 120}, 100%, 70%)`; 44 | } 45 | 46 | // Add a piece to the generation. 47 | function addPiece(piece) { 48 | var spanClone = pieceTemplate.content.cloneNode(true); 49 | spanClone.querySelector(".piece").textContent = piece; 50 | generationDiv.appendChild(spanClone); 51 | } 52 | 53 | // Color the last piece with a score. 54 | function colorLastPiece(percent) { 55 | var lastPiece = generationDiv.lastElementChild; 56 | lastPiece.style.backgroundColor = scoreColor(percent); 57 | } 58 | 59 | // Get the lastElementChild of the scorebox div. If there are no children, add 60 | // a new scores div. 
61 | function getScores() { 62 | if (scorebox.children.length == 0) { 63 | let scores = scoresTemplate.content.cloneNode(true); 64 | scorebox.appendChild(scores); 65 | } 66 | 67 | return scorebox.lastElementChild; 68 | } 69 | 70 | // Clear everything. 71 | function clear() { 72 | generationDiv.innerHTML = ""; 73 | scorebox.innerHTML = ""; 74 | progressBar.style.width = "0.0%"; 75 | progressBar.style.textContent = ""; 76 | } 77 | 78 | // Disable input fields. 79 | function disableInput(disabled) { 80 | inputTextField.disabled = disabled; 81 | chunkDropdown.disabled = disabled; 82 | } 83 | 84 | // Subscribe to the event source at `uri` with exponential backoff reconnect. 85 | function subscribe(uri) { 86 | var retryTime = 1; 87 | 88 | function connect(uri) { 89 | const events = new EventSource(uri); 90 | 91 | events.addEventListener("message", (ev) => { 92 | console.log("raw data", JSON.stringify(ev.data)); 93 | console.log("decoded data", JSON.stringify(JSON.parse(ev.data))); 94 | const res = JSON.parse(ev.data); 95 | if (!("content" in res) || !("kind" in res)) return; 96 | 97 | switch (res.kind) { 98 | case "piece": 99 | addPiece(res.content); 100 | break; 101 | case "token_unigram_score": 102 | colorLastPiece(res.content); 103 | // id is `token_unigram_score` 104 | let tokenUnigramScore = getScores().querySelector( 105 | "#token_unigram_score" 106 | ); 107 | tokenUnigramScore.textContent = res.content; 108 | break; 109 | case "token_bigram_score": 110 | let tokenBigramScore = getScores().querySelector( 111 | "#token_bigram_score" 112 | ); 113 | tokenBigramScore.textContent = res.content; 114 | break; 115 | case "progress": 116 | progressBar.style.width = res.content; 117 | progressBar.textContent = res.content; 118 | break; 119 | case "percent_of_tokens": 120 | progressBar.style.width = "0.0%"; 121 | progressBar.style.textContent = ""; 122 | if (generationDiv.children.length != 0) { 123 | addPiece("\n\n\n"); 124 | } 125 | // TODO: clean this up 126 | 
addPiece("percent of tokens: " + res.content + "\n\n\n"); 127 | var lastPiece = generationDiv.lastElementChild; 128 | lastPiece.style.color = "var(--text-color)"; 129 | let scores = scoresTemplate.content.cloneNode(true); 130 | let percentOfTokensScore = scores.querySelector( 131 | "#percent_of_tokens_score" 132 | ); 133 | percentOfTokensScore.textContent = res.content; 134 | scorebox.appendChild(scores); 135 | break; 136 | case "character_score": 137 | let characterScore = getScores().querySelector("#character_score"); 138 | characterScore.textContent = res.content; 139 | break; 140 | case "unigram_score": 141 | let unigramScore = getScores().querySelector("#unigram_score"); 142 | unigramScore.textContent = res.content; 143 | break; 144 | case "bigram_score": 145 | let bigramScore = getScores().querySelector("#bigram_score"); 146 | bigramScore.textContent = res.content; 147 | break; 148 | case "prefix": 149 | addPiece(res.content); 150 | var lastPiece = generationDiv.lastElementChild; 151 | lastPiece.style.backgroundColor = "blue"; 152 | break; 153 | case "ready": 154 | disableInput(false); 155 | setStatus(res.kind); 156 | console.log(res.content); 157 | break; 158 | case "error": 159 | case "fatal": 160 | case "busy": 161 | case "shutdown": 162 | disableInput(true); 163 | setStatus(res.kind); 164 | console.log(res.content); 165 | break; 166 | default: 167 | console.error( 168 | `unknown response kind: ${res.kind} with content: ${res.content}` 169 | ); 170 | } 171 | }); 172 | 173 | events.addEventListener("open", () => { 174 | setStatus("connected"); 175 | // TODO: On reconnect we should check the status, but our API is very 176 | // simple for this example code and doesn't support this yet, nor is 177 | // authentication implemented. We don't even have sessions. 
178 | clear(); 179 | console.log(`connected to event stream at ${uri}`); 180 | retryTime = 1; 181 | }); 182 | 183 | events.addEventListener("error", () => { 184 | setStatus("disconnected"); 185 | events.close(); 186 | 187 | let timeout = retryTime; 188 | retryTime = Math.min(64, retryTime * 2); 189 | console.log(`connection lost. attempting to reconnect in ${timeout}s`); 190 | setTimeout(() => connect(uri), (() => timeout * 1000)()); 191 | }); 192 | } 193 | 194 | connect(uri); 195 | } 196 | 197 | // Let's go! Initialize the world. 198 | function init() { 199 | // Set up the form handler. 200 | newRequestForm.addEventListener("submit", (e) => { 201 | e.preventDefault(); 202 | 203 | const text = inputTextField.value; 204 | const mode = "jaccard"; 205 | const chunks = chunkDropdown.value; 206 | 207 | if (STATE.status === "connected" || STATE.status === "ready") { 208 | fetch("/request", { 209 | method: "POST", 210 | body: new URLSearchParams({ mode, text, chunks }), 211 | }).then((response) => { 212 | if (response.ok) inputTextField.value = ""; 213 | }); 214 | } 215 | }); 216 | 217 | // Subscribe to server-sent events. 
218 | subscribe("/events"); 219 | } 220 | 221 | init(); 222 | -------------------------------------------------------------------------------- /bin/regurgitater/static/style.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --bg-dark: #242423; 3 | --bg-light: #333533; 4 | --fg-light: #e8eddf; 5 | --callout: rgb(255, 255, 102); 6 | --callout-dark: #101010; 7 | } 8 | 9 | * { 10 | font-size: 14px; 11 | } 12 | 13 | html, 14 | body, 15 | main { 16 | background-color: var(--bg-dark); 17 | color: #fff; 18 | font-family: "Inter", Arial, Helvetica, sans-serif, "Noto Color Emoji"; 19 | font-weight: 400; 20 | text-shadow: rgb(77, 81, 86) 0px 0px 0px; 21 | height: 100%; 22 | } 23 | 24 | main { 25 | display: flex; 26 | } 27 | 28 | button:hover:not(.active) { 29 | filter: brightness(1.15); 30 | cursor: pointer; 31 | } 32 | 33 | #sidebar { 34 | flex: 3 30%; 35 | display: flex; 36 | flex-direction: column; 37 | overflow: auto; 38 | background-color: var(--bg-light); 39 | } 40 | 41 | #sidebar button { 42 | height: 40px; 43 | margin-bottom: 1px; 44 | background: var(--bg-light); 45 | color: #fff; 46 | overflow: hidden; 47 | } 48 | 49 | #sidebar button.active { 50 | background: var(--bg-dark); 51 | color: var(--callout); 52 | font-weight: bold; 53 | box-shadow: 0px 2px 2px rgba(0, 0, 0, 0.9); 54 | z-index: 10; 55 | } 56 | 57 | #content { 58 | flex: 7 100%; 59 | overflow: auto; 60 | display: flex; 61 | flex-direction: column; 62 | } 63 | 64 | .piece { 65 | white-space: pre-wrap; 66 | color: var(--bg-dark); 67 | } 68 | 69 | #generation { 70 | padding: 10px 20px; 71 | flex: 1; 72 | } 73 | 74 | .scores { 75 | padding: 5px 5px; 76 | flex: 1; 77 | } 78 | 79 | form#request { 80 | bottom: 0; 81 | position: sticky; 82 | flex: 0 0 auto; 83 | width: 100%; 84 | } 85 | 86 | form { 87 | display: flex; 88 | border-top: 2px solid #242424; 89 | } 90 | 91 | form * { 92 | height: 40px; 93 | background: var(--fg-light); 94 | color: var(--bg-dark); 95 | 
} 96 | 97 | input { 98 | padding: 0 10px; 99 | } 100 | 101 | input:focus { 102 | outline: 0; 103 | filter: brightness(1.05); 104 | } 105 | 106 | input#username { 107 | text-align: right; 108 | flex: 1 25%; 109 | width: 25%; 110 | border-right: 1px solid #303030; 111 | } 112 | 113 | input#request_text { 114 | flex: 10 100%; 115 | } 116 | 117 | form button { 118 | padding: 0 10px; 119 | } 120 | 121 | #sidebar #new-room { 122 | display: flex; 123 | flex: 0 0 auto; 124 | flex-direction: row; 125 | } 126 | 127 | #status { 128 | padding: 5px 10px; 129 | text-align: center; 130 | font-size: 12px; 131 | } 132 | 133 | #status.pending::before { 134 | content: "status: connected"; 135 | } 136 | 137 | #status.pending { 138 | background-color: yellow; 139 | color: #000; 140 | } 141 | 142 | #status.connected::before { 143 | content: "status: connected"; 144 | } 145 | 146 | #status.connected { 147 | background-color: orange; 148 | color: #fff; 149 | } 150 | 151 | #status.disconnected::before { 152 | content: "status: disconnected"; 153 | } 154 | 155 | #status.disconnected { 156 | background-color: red; 157 | color: #fff; 158 | } 159 | 160 | #status.ready::before { 161 | content: "status: ready"; 162 | } 163 | 164 | #status.ready { 165 | background-color: green; 166 | color: #fff; 167 | } 168 | 169 | #status.busy::before { 170 | content: "status: generating..."; 171 | } 172 | 173 | #status.busy { 174 | background-color: blue; 175 | color: #fff; 176 | } 177 | 178 | #status.error::before { 179 | content: "status: error"; 180 | } 181 | 182 | #status.error { 183 | background-color: red; 184 | color: #fff; 185 | } 186 | 187 | #status.reconnecting::before { 188 | content: "status: reconnecting"; 189 | } 190 | 191 | #status.reconnecting { 192 | background-color: red; 193 | color: #fff; 194 | } 195 | 196 | #status.shutdown::before { 197 | content: "status: shutdown"; 198 | } 199 | 200 | #status.shutdown { 201 | background-color: red; 202 | color: #fff; 203 | } 204 | 205 | /* Thank you, 
Bing Copilot for this help here. */ 206 | .progress { 207 | width: 100%; /* Set the desired width for the progress container */ 208 | } 209 | 210 | .progress_bar { 211 | width: 0%; /* Set the initial width for the progress bar */ 212 | background-color: #4caf50; /* Set the color for the progress bar */ 213 | text-align: center; /* Center the progress text (optional) */ 214 | color: #ffffff; /* Text color (optional) */ 215 | } 216 | -------------------------------------------------------------------------------- /bin/settings_tool/README.md: -------------------------------------------------------------------------------- 1 | # `settings_tool` 2 | 3 | Is a very simple tool for editing `drama_llama` options via a gui. It's mostly 4 | to test the `egui` feature but it may be useful to generate configuration files. 5 | 6 | Run it like: 7 | 8 | ```text 9 | cargo run --bin settings_tool --features="egui,toml,serde,serde_json" 10 | ``` 11 | 12 | ## Notes 13 | 14 | - TOML cannot store the settings properly because it doesn't support u128, or at 15 | least the `toml` crate doesn't. It's there because at some point we might 16 | store the seed differently (like two u64s). 17 | -------------------------------------------------------------------------------- /bin/settings_tool/settings_tool.rs: -------------------------------------------------------------------------------- 1 | /// A simple tool to test the settings GUI. It can be used to generate a TOML 2 | /// representation of the settings but is mostly just a testbed for the GUI. 
3 | use drama_llama::PredictOptions; 4 | use rocket::serde::Serialize; 5 | 6 | #[derive(Clone, Copy, PartialEq, Default)] 7 | enum Mode { 8 | #[default] 9 | JSON, 10 | #[cfg(feature = "toml")] 11 | TOML, 12 | } 13 | 14 | impl Mode { 15 | fn render(self, s: &S) -> String 16 | where 17 | S: Serialize, 18 | { 19 | match self { 20 | Mode::JSON => match serde_json::to_string_pretty(&s) { 21 | Ok(s) => s, 22 | Err(e) => format!("Error: {}", e), 23 | }, 24 | #[cfg(feature = "toml")] 25 | Mode::TOML => match toml::to_string_pretty(&s) { 26 | Ok(s) => s, 27 | Err(e) => format!("Error: {}", e), 28 | }, 29 | } 30 | } 31 | 32 | fn as_str(self) -> &'static str { 33 | match self { 34 | Mode::JSON => "JSON", 35 | #[cfg(feature = "toml")] 36 | Mode::TOML => "TOML", 37 | } 38 | } 39 | } 40 | 41 | #[derive(Default)] 42 | struct App { 43 | pub options: PredictOptions, 44 | pub mode: Mode, 45 | } 46 | 47 | impl eframe::App for App { 48 | fn update(&mut self, ctx: &egui::Context, _frame: &mut eframe::Frame) { 49 | egui::SidePanel::left("settings") 50 | .default_width(400.0) 51 | .show(ctx, |ui| self.options.draw(ui)); 52 | 53 | egui::CentralPanel::default().show(ctx, |ui| { 54 | egui::ComboBox::from_label("Format") 55 | .selected_text(self.mode.as_str()) 56 | .show_ui(ui, |ui| { 57 | ui.selectable_value( 58 | &mut self.mode, 59 | Mode::JSON, 60 | Mode::JSON.as_str(), 61 | ); 62 | #[cfg(feature = "toml")] 63 | ui.selectable_value( 64 | &mut self.mode, 65 | Mode::TOML, 66 | Mode::TOML.as_str(), 67 | ); 68 | }); 69 | 70 | ui.separator(); 71 | ui.label(self.mode.render(&self.options)) 72 | }); 73 | } 74 | } 75 | 76 | pub fn main() -> Result<(), Box> { 77 | eframe::run_native( 78 | "`drama_llama` Settings Tool", 79 | eframe::NativeOptions::default(), 80 | Box::new(|_| Box::new(App::default())), 81 | )?; 82 | 83 | Ok(()) 84 | } 85 | -------------------------------------------------------------------------------- /models/README.md: 
-------------------------------------------------------------------------------- 1 | A model should be copied or linked in this folder with the name `model.gguf` for 2 | testing purposes. 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 80 -------------------------------------------------------------------------------- /src/batch.rs: -------------------------------------------------------------------------------- 1 | use llama_cpp_sys_3::{ 2 | llama_batch, llama_batch_free, llama_batch_init, llama_seq_id, llama_token, 3 | }; 4 | use thiserror::Error; 5 | 6 | /// A `Batch` of tokens or embeddings. This wraps a [`llama_batch`] and provides 7 | /// safe accessors for it's members. 8 | #[derive(Debug)] 9 | pub struct Batch { 10 | /// The underlying C struct 11 | pub(crate) batch: llama_batch, 12 | /// Batch size (maximum number of members in the batch) 13 | pub(crate) capacity: usize, 14 | /// The number of allocated embeddings. When batch.tokens is null: 15 | /// 16 | /// ```C 17 | /// batch.embd = (float *) malloc(sizeof(float) * capacity * embd); 18 | /// ``` 19 | pub(crate) embd_len: usize, 20 | /// The maximum number of sequence ids per token. 21 | pub(crate) n_seq_max: usize, 22 | } 23 | 24 | #[derive(Debug, Error, PartialEq)] 25 | pub enum AddError { 26 | #[error("The batch is full")] 27 | Full, 28 | #[error("The number of sequence ids does not match the batch's n_seq_max")] 29 | InvalidSequenceLength, 30 | // FIXME: add `add_embedding` method to `Batch` 31 | #[error("A token was supplied, but thet batch was created with embd_len > 0. Call `add_embedding` instead.")] 32 | ExpectedEmbedding, 33 | #[error("An embedding was supplied, but thet batch was created with embd_len == 0. 
Call `add_token` instead.")] 34 | ExpectedToken, 35 | #[error("Invalid token position.")] 36 | InvalidPosition, 37 | } 38 | 39 | static_assertions::assert_impl_all!(AddError: Send, Sync); 40 | 41 | impl Batch { 42 | /// Create a new [`Batch`] with the given `capacity` for tokens or 43 | /// embeddings. If `embd_len` is zero, the `tokens` accessor will be 44 | /// available, otherwise the `embd` accessor will be available. Each token 45 | /// can be assigned up to `n_seq_max` sequence ids. Returns `None` if any argument does not fit the integer type expected by `llama_batch_init`. 46 | pub fn new( 47 | capacity: usize, 48 | embd_len: usize, 49 | n_seq_max: usize, 50 | ) -> Option { 51 | let batch = unsafe { 52 | llama_batch_init( 53 | capacity.try_into().ok()?, 54 | embd_len.try_into().ok()?, 55 | n_seq_max.try_into().ok()?, 56 | ) 57 | }; 58 | 59 | // sanity: a freshly initialized batch must not report any tokens 60 | debug_assert!(batch.n_tokens == 0); 61 | 62 | Some(Self { 63 | batch, 64 | capacity, 65 | embd_len, 66 | n_seq_max, 67 | }) 68 | } 69 | 70 | /// Create a new token [`Batch`] holding `tokens` at positions `0..tokens.len()`. The `logits` flag 71 | /// is set to `true` for the last token only and `false` for all others. If `capacity` is 72 | /// less than the number of tokens, the number of tokens is used as the capacity instead. Returns `None` if the batch cannot be created or a token cannot be added. 73 | pub fn from_tokens( 74 | capacity: usize, 75 | tokens: &[llama_token], 76 | ) -> Option { 77 | let mut batch = Self::new(capacity.max(tokens.len()), 0, 1)?; 78 | 79 | for (i, token) in tokens.iter().enumerate() { 80 | let logits = i == tokens.len() - 1; 81 | batch.add_token(*token, i, None, logits).ok()?; 82 | } 83 | 84 | Some(batch) 85 | } 86 | 87 | /// The maximum number of members in the batch. 88 | pub const fn capacity(&self) -> usize { 89 | self.capacity 90 | } 91 | 92 | /// The current number of members in the batch. 93 | pub const fn len(&self) -> usize { 94 | self.batch.n_tokens as usize 95 | } 96 | 97 | /// Returns true if batch is empty. 98 | pub const fn is_empty(&self) -> bool { 99 | self.len() == 0 100 | } 101 | 102 | /// The size of each embedding. 
103 | pub const fn embd_len(&self) -> usize { 104 | self.embd_len 105 | } 106 | 107 | /// The maximum number of sequence ids per token. 108 | pub const fn n_seq_max(&self) -> usize { 109 | self.n_seq_max 110 | } 111 | 112 | /// The tokens currently in this batch (the first `len()` entries). 113 | /// 114 | /// This will return `None` if the token pointer is null, i.e. the [`Batch`] was created with a 115 | /// non-zero `embd_len` and therefore holds embeddings rather than tokens. 116 | pub fn tokens(&self) -> Option<&[llama_token]> { 117 | if self.batch.token.is_null() { 118 | debug_assert!(!self.batch.embd.is_null()); 119 | None 120 | } else { 121 | Some( 122 | &unsafe { 123 | std::slice::from_raw_parts( 124 | self.batch.token, 125 | self.capacity(), 126 | ) 127 | }[..self.len() as usize], 128 | ) 129 | } 130 | } 131 | 132 | /// Mutable access to the tokens currently in this batch (the first `len()` entries). 133 | /// 134 | /// This will return `None` if the token pointer is null, i.e. the [`Batch`] was created with a 135 | /// non-zero `embd_len` and therefore holds embeddings rather than tokens. 136 | pub fn tokens_mut(&mut self) -> Option<&mut [llama_token]> { 137 | if self.batch.token.is_null() { 138 | debug_assert!(!self.batch.embd.is_null()); 139 | None 140 | } else { 141 | Some( 142 | &mut unsafe { 143 | std::slice::from_raw_parts_mut( 144 | self.batch.token, 145 | self.capacity(), 146 | ) 147 | }[..self.len() as usize], 148 | ) 149 | } 150 | } 151 | 152 | /// The embedding in this batch at index `i`, as a slice of `embd_len()` floats. 153 | /// 154 | /// This will return None if `i >= len()` or if the embedding pointer is null, i.e. the batch was 155 | /// created with `embd_len` set to zero. 156 | pub fn embd(&self, i: usize) -> Option<&[f32]> { 157 | if self.batch.embd.is_null() { 158 | debug_assert!(!self.batch.token.is_null()); 159 | None 160 | } else { 161 | if (i as usize) >= self.len() { 162 | None 163 | } else { 164 | Some(unsafe { 165 | std::slice::from_raw_parts( 166 | self.batch.embd.add(i * self.embd_len()), 167 | self.embd_len(), 168 | ) 169 | }) 170 | } 171 | } 172 | } 173 | 174 | /// The embeddings in this batch at index `i`. 
175 | /// 176 | /// This will return None if the index is invalid or if the batch was 177 | /// created with `embd_len` set to zero. 178 | pub fn embd_mut(&mut self, i: usize) -> Option<&mut [f32]> { 179 | if self.batch.embd.is_null() { 180 | debug_assert!(!self.batch.token.is_null()); 181 | None 182 | } else { 183 | if (i as usize) >= self.len() { 184 | None 185 | } else { 186 | Some(unsafe { 187 | std::slice::from_raw_parts_mut( 188 | self.batch.embd.add(i * self.embd_len()), 189 | self.embd_len(), 190 | ) 191 | }) 192 | } 193 | } 194 | } 195 | 196 | /// The position of a given index in the batch. 197 | pub const fn pos(&self) -> &[i32] { 198 | unsafe { std::slice::from_raw_parts(self.batch.pos, self.len()) } 199 | } 200 | 201 | /// The position of a given index in the batch. 202 | pub fn pos_mut(&mut self) -> &mut [i32] { 203 | unsafe { std::slice::from_raw_parts_mut(self.batch.pos, self.len()) } 204 | } 205 | 206 | /// The number of sequence ids for a given index in the batch. 207 | pub const fn n_seq(&self) -> &[i32] { 208 | unsafe { std::slice::from_raw_parts(self.batch.n_seq_id, self.len()) } 209 | } 210 | 211 | /// The number of sequence ids for a given index in the batch. 212 | fn n_seq_mut(&mut self) -> &mut [i32] { 213 | unsafe { 214 | std::slice::from_raw_parts_mut(self.batch.n_seq_id, self.len()) 215 | } 216 | } 217 | 218 | /// Whether logits should be calculated at a given index in the batch. 219 | pub fn logits(&self) -> &[bool] { 220 | // Safety: This and the accessor below are safe because we know a bool 221 | // is the same size as an i8 and we know the 0 and 1 values correspond 222 | // to false and true. 
Otherwise the following would not compile: 223 | static_assertions::assert_eq_size!(bool, i8); 224 | static_assertions::const_assert_eq!(false as i8, 0); 225 | static_assertions::const_assert_eq!(true as i8, 1); 226 | 227 | unsafe { 228 | std::slice::from_raw_parts( 229 | self.batch.logits as *const bool, 230 | self.len(), 231 | ) 232 | } 233 | } 234 | 235 | /// Whether logits should be calculated at a given index in the batch. 236 | fn logits_mut(&mut self) -> &mut [bool] { 237 | unsafe { 238 | std::slice::from_raw_parts_mut( 239 | self.batch.logits as *mut bool, 240 | self.len(), 241 | ) 242 | } 243 | } 244 | 245 | /// Clear the batch. 246 | pub fn clear(&mut self) { 247 | self.batch.n_tokens = 0; 248 | } 249 | 250 | /// Add a token to the batch. 251 | pub fn add_token( 252 | &mut self, 253 | token: llama_token, 254 | pos: usize, 255 | seq_ids: Option<&[llama_seq_id]>, 256 | logits: bool, 257 | ) -> Result<(), AddError> { 258 | let i = self.len(); 259 | 260 | if pos >= self.capacity() { 261 | return Err(AddError::InvalidPosition); 262 | } 263 | 264 | if i >= self.capacity() { 265 | return Err(AddError::Full); 266 | } 267 | 268 | if self.embd_len() != 0 { 269 | return Err(AddError::ExpectedEmbedding); 270 | } 271 | 272 | self.batch.n_tokens += 1; 273 | 274 | self.tokens_mut().unwrap()[i] = token; 275 | self.pos_mut()[i] = pos as i32; 276 | 277 | let sequences = unsafe { 278 | std::slice::from_raw_parts_mut(self.batch.seq_id, self.len()) 279 | }; 280 | let sequence = unsafe { 281 | std::slice::from_raw_parts_mut(sequences[i], self.n_seq_max()) 282 | }; 283 | 284 | match seq_ids { 285 | Some(seq_ids) => { 286 | if seq_ids.len() > self.n_seq_max() { 287 | self.batch.n_tokens -= 1; 288 | return Err(AddError::InvalidSequenceLength); 289 | } 290 | 291 | // We want to panic if the number of sequence ids is greater 292 | // than i32::MAX 293 | self.n_seq_mut()[i] = seq_ids.len().try_into().unwrap(); 294 | 295 | // Safety: This is safe because we control construction of 
the 296 | // batch and we know that the sequence ids are valid for the 297 | // lifetime of the batch. We also know that len is valid because 298 | // the only way it changes is through our accessor methods. 299 | sequence[..seq_ids.len()].copy_from_slice(seq_ids); 300 | sequence[seq_ids.len()..].fill(0); 301 | } 302 | None => { 303 | // There is always at least one sequence id 304 | self.n_seq_mut()[i] = 1; 305 | sequence[0] = 0; 306 | } 307 | } 308 | self.logits_mut()[i] = logits; 309 | 310 | Ok(()) 311 | } 312 | 313 | /// Add tokens to the batch at consecutive positions starting at `pos`. The same `seq_ids` and `logits` flag are applied to every token. On error, tokens added before the failing one remain in the batch. 314 | pub fn add_tokens( 315 | &mut self, 316 | tokens: I, 317 | pos: usize, 318 | seq_ids: Option<&[llama_seq_id]>, 319 | logits: bool, 320 | ) -> Result<(), AddError> 321 | where 322 | I: IntoIterator, 323 | { 324 | for (offset, token) in tokens.into_iter().enumerate() { 325 | /* every token needs its own position, so advance from `pos`; an overflowing position is reported as invalid */ let pos = pos.checked_add(offset).ok_or(AddError::InvalidPosition)?; self.add_token(token, pos, seq_ids, logits)?; 326 | } 327 | 328 | Ok(()) 329 | } 330 | } 331 | 332 | impl Drop for Batch { 333 | fn drop(&mut self) { 334 | unsafe { llama_batch_free(self.batch) }; 335 | } 336 | } 337 | 338 | #[cfg(test)] 339 | mod tests { 340 | 341 | use super::*; 342 | 343 | #[test] 344 | fn test_batch() { 345 | for n_seq_max in 1..16usize { 346 | let mut batch = Batch::new(16, 0, n_seq_max).unwrap(); 347 | 348 | for i in 0..16 { 349 | assert_eq!(batch.capacity(), 16); 350 | assert_eq!(batch.len(), i); 351 | assert_eq!(batch.embd_len(), 0); 352 | assert_eq!(batch.n_seq_max(), n_seq_max as usize); 353 | assert!(batch.tokens().is_some()); 354 | assert!(batch.tokens_mut().is_some()); 355 | assert!(batch.embd(i).is_none()); 356 | assert!(batch.embd_mut(i).is_none()); 357 | assert_eq!( 358 | batch.add_token( 359 | i as llama_token, 360 | i, 361 | Some(&vec![42; n_seq_max as usize]), 362 | true 363 | ), 364 | Ok(()) 365 | ); 366 | assert_eq!(batch.n_seq()[i], n_seq_max as i32); 367 | assert_eq!(batch.logits()[i], true); 368 | assert_eq!(batch.pos()[i], i as i32); 369 | } 370 | 371 | batch.clear(); 372 | 373 | for i in 0..16_usize { 374 | assert_eq!(batch.capacity(), 
16); 375 | assert_eq!(batch.len(), i); 376 | assert_eq!(batch.embd_len(), 0); 377 | assert_eq!(batch.n_seq_max(), n_seq_max); 378 | assert!(batch.tokens().is_some()); 379 | assert!(batch.tokens_mut().is_some()); 380 | assert!(batch.embd(i).is_none()); 381 | assert!(batch.embd_mut(i).is_none()); 382 | assert_eq!( 383 | batch.add_token(i as llama_token, i, None, false), 384 | Ok(()) 385 | ); 386 | assert_eq!(batch.n_seq()[i], 1); 387 | assert_eq!(batch.logits()[i], false); 388 | assert_eq!(batch.pos()[i], i as i32); 389 | } 390 | 391 | // The batch is full 392 | assert_eq!( 393 | batch.add_token(16, 15, None, true), 394 | Err(AddError::Full) 395 | ); 396 | // The position is invalid 397 | assert_eq!( 398 | batch.add_token(16, 16, None, true), 399 | Err(AddError::InvalidPosition) 400 | ); 401 | } 402 | } 403 | } 404 | -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::Parser; 4 | 5 | use llama_cpp_sys_3::{ 6 | llama_context_default_params, llama_context_params, 7 | llama_model_default_params, llama_model_params, 8 | }; 9 | 10 | use crate::VocabKind; 11 | 12 | #[derive(Debug, Parser)] 13 | pub struct Args { 14 | /// Path to the model 15 | #[arg(short, long)] 16 | pub model: PathBuf, 17 | /// Context size 18 | #[arg(short, long, default_value_t = 1024)] 19 | pub context: u32, 20 | /// Disable on-by-default GPU acceleration 21 | #[arg(short, long, default_value_t = false)] 22 | pub no_gpu: bool, 23 | /// Vocabulary 24 | #[arg(short, long, default_value_t = VocabKind::Safe)] 25 | pub vocab: VocabKind, 26 | } 27 | 28 | impl Args { 29 | /// Create `llama_model_params` from `Args`. Defaults are used for fields 30 | /// not specified in `Args`. 31 | pub fn model_params(&self) -> llama_model_params { 32 | self.into() 33 | } 34 | 35 | /// Create `llama_context_params` from `Args`. 
Defaults are used for fields 36 | /// not specified in `Args`. 37 | pub fn context_params(&self) -> llama_context_params { 38 | self.into() 39 | } 40 | } 41 | 42 | impl From<&Args> for llama_model_params { 43 | fn from(args: &Args) -> Self { 44 | // Safety: This returns POD and makes no allocations for the pointer 45 | // fields, which are optional and initialized to null. 46 | let mut params = unsafe { llama_model_default_params() }; 47 | params.n_gpu_layers = if args.no_gpu { 0 } else { 1000 }; 48 | 49 | params 50 | } 51 | } 52 | 53 | impl From<&Args> for llama_context_params { 54 | fn from(args: &Args) -> Self { 55 | // Safety: same as above 56 | let mut params = unsafe { llama_context_default_params() }; 57 | params.n_ctx = args.context; 58 | 59 | params 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/data.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod stopwords; 2 | pub use stopwords::StopWords; 3 | 4 | pub(crate) mod banned; 5 | -------------------------------------------------------------------------------- /src/data/stopwords.rs: -------------------------------------------------------------------------------- 1 | use llama_cpp_sys_3::llama_token; 2 | 3 | use crate::Model; 4 | 5 | /// A list of very common words for various languages. These can be used to 6 | /// ignore certain tokens for the purposes of repetition detection, etc. 7 | #[cfg_attr( 8 | feature = "serde", 9 | derive(rocket::serde::Deserialize, rocket::serde::Serialize) 10 | )] 11 | #[cfg_attr(feature = "serde", serde(crate = "rocket::serde"))] 12 | #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)] 13 | pub enum StopWords { 14 | // NOTE: If you add a new language here, add it to ALL and sort this list 15 | // and ALL in alphabetical order. 16 | // TODO: static assert all this. 
17 | English, 18 | } 19 | 20 | impl StopWords { 21 | pub const ALL: [StopWords; 1] = [StopWords::English]; 22 | 23 | pub const fn as_str(&self) -> &'static str { 24 | match self { 25 | StopWords::English => "English", 26 | } 27 | } 28 | 29 | pub const fn words(&self) -> &'static [&'static str] { 30 | match self { 31 | StopWords::English => ENGLISH, 32 | } 33 | } 34 | 35 | /// Tokenizes `self` using the given `model``. 36 | pub fn into_tokens( 37 | self, 38 | model: &Model, 39 | ) -> impl Iterator + '_ { 40 | self.words() 41 | .iter() 42 | // TODO: there is allocation here that can be avoided by turning the 43 | // tokenize function into a method returning an iterator, however 44 | // it's not a big deal since this is only done once. 45 | .map(|word| model.tokenize(word, false).into_iter()) 46 | .flatten() 47 | } 48 | } 49 | 50 | /// A list of common English stopwords from NLTK. 51 | pub const ENGLISH: &[&str] = &[ 52 | "a", 53 | "about", 54 | "above", 55 | "after", 56 | "again", 57 | "against", 58 | "all", 59 | "am", 60 | "an", 61 | "and", 62 | "any", 63 | "are", 64 | "as", 65 | "at", 66 | "be", 67 | "because", 68 | "been", 69 | "before", 70 | "being", 71 | "below", 72 | "between", 73 | "both", 74 | "but", 75 | "by", 76 | "can", 77 | "did", 78 | "do", 79 | "does", 80 | "doing", 81 | "don", 82 | "down", 83 | "during", 84 | "each", 85 | "few", 86 | "for", 87 | "from", 88 | "further", 89 | "had", 90 | "has", 91 | "have", 92 | "having", 93 | "he", 94 | "her", 95 | "here", 96 | "hers", 97 | "herself", 98 | "him", 99 | "himself", 100 | "his", 101 | "how", 102 | "i", 103 | "if", 104 | "in", 105 | "into", 106 | "is", 107 | "it", 108 | "its", 109 | "itself", 110 | "just", 111 | "me", 112 | "more", 113 | "most", 114 | "my", 115 | "myself", 116 | "no", 117 | "nor", 118 | "not", 119 | "now", 120 | "of", 121 | "off", 122 | "on", 123 | "once", 124 | "only", 125 | "or", 126 | "other", 127 | "our", 128 | "ours", 129 | "ourselves", 130 | "out", 131 | "over", 132 | "own", 133 | "s", 
134 | "same", 135 | "she", 136 | "should", 137 | "so", 138 | "some", 139 | "such", 140 | "t", 141 | "than", 142 | "that", 143 | "the", 144 | "their", 145 | "theirs", 146 | "them", 147 | "themselves", 148 | "then", 149 | "there", 150 | "these", 151 | "they", 152 | "this", 153 | "those", 154 | "through", 155 | "to", 156 | "too", 157 | "under", 158 | "until", 159 | "up", 160 | "very", 161 | "was", 162 | "we", 163 | "were", 164 | "what", 165 | "when", 166 | "where", 167 | "which", 168 | "while", 169 | "who", 170 | "whom", 171 | "why", 172 | "will", 173 | "with", 174 | "you", 175 | "your", 176 | "yours", 177 | "yourself", 178 | "yourselves", 179 | ]; 180 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // TODO: Importing everything from the submodules is fine for small crates, but 2 | // this is getting crowded. When we go to version 0.2.0, we should consider 3 | // making modules public. 
4 | 5 | #[cfg(feature = "cli")] 6 | pub mod cli; 7 | 8 | #[cfg_attr(test, macro_use)] 9 | pub(crate) mod utils; 10 | 11 | pub mod data; 12 | 13 | mod sample; 14 | pub use sample::{ 15 | RepetitionError, RepetitionOptions, SampleOptions, SamplingMode, 16 | }; 17 | 18 | mod batch; 19 | pub(crate) use batch::Batch; 20 | 21 | mod candidates; 22 | pub use candidates::{Candidates, Sorted, TokenDataArray}; 23 | 24 | pub mod prompt; 25 | pub use prompt::{Message, Prompt, Role}; 26 | 27 | mod model; 28 | pub use model::{llama_quantize, Model, Vocab, VocabKind}; 29 | 30 | mod ngram; 31 | pub use ngram::{NGram, NGramData, NGramStats}; 32 | 33 | mod engine; 34 | pub use engine::{Engine, NewError}; 35 | 36 | mod predictor; 37 | pub use predictor::{ 38 | CandidatePredictor, PiecePredictor, PredictOptions, Predicted, Predictor, 39 | TokenPredictor, 40 | }; 41 | 42 | mod probability; 43 | pub use probability::{InvalidProbability, Probability}; 44 | 45 | pub const TOS: &str = include_str!("../TERMS_OF_USE.md"); 46 | -------------------------------------------------------------------------------- /src/model/vocab.rs: -------------------------------------------------------------------------------- 1 | //! Vocabulary constraints. 2 | 3 | use llama_cpp_sys_3::llama_token; 4 | use regex::Regex; 5 | 6 | use crate::{data::banned::Banned, model::token_to_piece_ref, Model, NGram}; 7 | 8 | /// A very imperfect regex for safe tokens. This could use some improvement. 9 | pub const SAFE_REGEX: &str = r#"^[▁ a-zA-Z]{2,32}|[ ▁\(\)\.\?!\"\'\-_]{1,32}|[aAI]{1}|\n{1,3}|\t{1,3}| {1,16}$"#; 10 | pub const LETTERS_REGEX: &str = r#"^[a-zA-Z]{1}$"#; 11 | pub const CODE_REGEX: &str = r#"^[ \d\\(\){\}\[\]\;\:\"\'\<\>\,\.\\\/\?\.\!\@\#\$\%\^\&\=\`\~]{1,32}|\w{2,32}$"#; 12 | 13 | // This is temporary until we can get the regex working for llama. It works in 14 | // regex101, but not here. With these tokens banned, weird things happen. 
15 | const LLAMA_2_ALLOW_LIST: &[llama_token] = &[ 16 | 0, // unknown 17 | 1, // bos 18 | 2, // eos 19 | 0x0D, // \n 20 | 0x20, // space 21 | 0x49, // I 22 | 0x3D, // = 23 | 0x61, // a 24 | 0x75, // u 25 | 29871, // ▁ (word boundary) 26 | 29874, // a 27 | 29889, // . 28 | 29892, // , 29 | 29897, // ) 30 | 29898, // ( 31 | 29899, // - 32 | 29901, // : 33 | 29902, // I 34 | 29909, // A 35 | 29912, // { 36 | 29913, // } 37 | 29915, // ' 38 | 29918, // _ 39 | 29922, // = 40 | 29930, // * 41 | 29936, // ; 42 | 29937, // # 43 | 29938, // $ 44 | 29944, // л 45 | 29961, // [ 46 | 29962, // ] 47 | 29973, // ? 48 | 29974, // + 49 | 29985, // ^ 50 | 29989, // | 51 | 29991, // ! 52 | 29992, // @ 53 | 29995, // % 54 | 30022, // ~ 55 | 30098, // … 56 | 30142, // λ 57 | ]; 58 | 59 | const LLAMA_2_ALLOW_RANGES: &[std::ops::RangeInclusive] = &[ 60 | 0x20..=0x3C, // !"#$%&'()*+,-./0123456789:; 61 | 0x3F..=0x41, // ?@A 62 | 0x5B..=0x60, // [\]^_` 63 | 0x7B..=0x7E, // {|}~ 64 | ]; 65 | 66 | #[cfg_attr(feature = "cli", derive(clap::ValueEnum))] 67 | #[derive(Debug, Clone, Copy, PartialEq)] 68 | pub enum VocabKind { 69 | /// All tokens and control characters are allowed. This is not recommended, 70 | /// especially if the output is going to be used in a web context. Banned 71 | /// n-grams are still enforced. 72 | Unsafe, 73 | /// Words, word fragments, punctuation, and the letter "a" are allowed. This 74 | /// is the default vocabulary. The idea is to prohibit generation of 75 | /// arbitrary sequences which could bypass filters, as well as code which 76 | /// could cause security issues. 77 | /// 78 | /// That being said *this is not yet validated* to be very safe, so care 79 | /// should be taken especially for web contexts. 80 | Safe, 81 | /// Letters only. Allowing this will allow generation of any sequence, but 82 | /// only one letter at a time. This is unsafe and should not be used unless 83 | /// it's absolutely necessary. 
84 | /// 85 | /// Using it to generate bigotry is a violation of the license under which 86 | /// this software is distributed. See `LICENSE.md` for details. 87 | Letters, 88 | /// Code. This will allow generation of words, digits, and common symbols 89 | /// used in code. Letters are not enabled. 90 | // Because 4chan got GPT-4 to generate the N word by getting it to "run" 91 | // code concatenating individual letters, we have to ban this. 92 | Code, 93 | } 94 | 95 | // derive_more::Display is failing, so we're implementing it manually. 96 | #[cfg(feature = "cli")] 97 | impl std::fmt::Display for VocabKind { 98 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 99 | match self { 100 | VocabKind::Unsafe => write!(f, "unsafe"), 101 | VocabKind::Safe => write!(f, "safe"), 102 | VocabKind::Letters => write!(f, "letters"), 103 | VocabKind::Code => write!(f, "code"), 104 | } 105 | } 106 | } 107 | 108 | /* Converts a vocabulary kind into the regex that a token's piece must match to be allowed. Panics only if one of the pattern constants fails to compile. */ impl Into for VocabKind { 109 | fn into(self) -> Regex { 110 | match self { 111 | /* match every piece, newlines included; a bare "*" has nothing to repeat and is rejected by the regex parser */ VocabKind::Unsafe => Regex::new("(?s).*").unwrap(), 112 | VocabKind::Safe => Regex::new(SAFE_REGEX).unwrap(), 113 | VocabKind::Letters => Regex::new(LETTERS_REGEX).unwrap(), 114 | VocabKind::Code => Regex::new(CODE_REGEX).unwrap(), 115 | } 116 | } 117 | } 118 | 119 | #[derive(Debug)] 120 | pub struct Vocab { 121 | /// Allowed tokens. This is a Vec of bool rather than a vec of ranges so 122 | /// that lookup is O(1). This will happen in a fairly tight loop, so it's 123 | /// probably worth it. 124 | allowed_tokens: Vec, 125 | /// Banned ngrams. These are at least all possible pairs of tokens that 126 | /// would generate a banned word. Letters are not included since the number 127 | /// of permutations is too high. 128 | banned: Option, 129 | /// Longest token length. This is used to optimize search for stop strings. 
130 | longest_token: usize, 131 | } 132 | 133 | impl Vocab { 134 | pub fn new( 135 | enabled: impl IntoIterator, 136 | model: &Model, 137 | ) -> Self { 138 | let enabled: Vec = enabled.into_iter().collect(); 139 | let banned = if model.desc().to_lowercase().starts_with("llama v2") { 140 | Some(Banned::LlamaEnglish) 141 | } else { 142 | None 143 | }; 144 | if enabled.contains(&VocabKind::Unsafe) { 145 | return Self { 146 | allowed_tokens: vec![true; model.n_vocab() as usize], 147 | longest_token: model.max_token_len(), 148 | banned, 149 | }; 150 | } 151 | let enabled: Vec = enabled.into_iter().map(Into::into).collect(); 152 | 153 | let n_tokens = model.n_vocab(); 154 | 155 | let mut buf = Vec::new(); 156 | 157 | let mut allowed_tokens: Vec = (0..n_tokens) 158 | .map(|token| { 159 | token_to_piece_ref(token, model, &mut buf); 160 | enabled 161 | .iter() 162 | .any(|re| re.is_match(&String::from_utf8_lossy(&buf))) 163 | }) 164 | .collect(); 165 | 166 | if model.desc().to_lowercase().starts_with("llama v2") { 167 | for &token in LLAMA_2_ALLOW_LIST { 168 | allowed_tokens[token as usize] = true; 169 | } 170 | 171 | for range in LLAMA_2_ALLOW_RANGES { 172 | for token in range.clone() { 173 | allowed_tokens[token as usize] = true; 174 | } 175 | } 176 | } 177 | 178 | // TODO: Fix regex, or add LLAMA_3 allow list. As it is now, generation 179 | // is potato without "Unsafe" vocab because the regex is too strict. 180 | 181 | Self { 182 | allowed_tokens, 183 | longest_token: model.max_token_len(), 184 | banned, 185 | } 186 | } 187 | 188 | /// Returns true if an ngram is forbidden. Forbidden [`NGram`]s are those 189 | /// that contain a token that is not allowed, or that are in the banned 190 | /// ngrams set. 
191 | pub fn is_forbidden(&self, ngram: &NGram) -> bool { 192 | if ngram 193 | .iter() 194 | .any(|&token| !self.allowed_tokens[token as usize]) 195 | { 196 | return true; 197 | } 198 | if let Some(banned) = &self.banned { 199 | banned 200 | .as_slice() 201 | .binary_search(&[ngram[0], ngram[1]]) 202 | .is_ok() 203 | } else { 204 | false 205 | } 206 | } 207 | 208 | /// Piece length of the longest token. 209 | /// 210 | /// Time complexity: O(1). 211 | pub fn max_token_len(&self) -> usize { 212 | self.longest_token 213 | } 214 | 215 | /// Allowed tokens. 216 | pub fn allowed_tokens(&self) -> &Vec { 217 | &self.allowed_tokens 218 | } 219 | 220 | /// Banned ngrams. 221 | pub fn banned(&self) -> Option<&Banned> { 222 | self.banned.as_ref() 223 | } 224 | 225 | /// Returns the number of allowed tokens. 226 | /// 227 | /// O(n) where n is the number of tokens. 228 | pub fn n_allowed(&self) -> usize { 229 | self.allowed_tokens.iter().filter(|&&b| b).count() 230 | } 231 | } 232 | 233 | #[cfg(test)] 234 | mod tests { 235 | use super::*; 236 | use llama_cpp_sys_3::llama_token; 237 | use rayon::prelude::*; 238 | use std::{ 239 | collections::{BTreeSet, HashSet}, 240 | path::PathBuf, 241 | }; 242 | 243 | /// Generate banned ngrams for a model. This is very slow and can take a few 244 | /// minutes even on a fast machine. It is only used for testing and 245 | /// generating the banned ngrams for the various models. 246 | fn generate_banned_ngrams(model: &Model) -> BTreeSet { 247 | // Safety: this is only called from test code and we don't use any 248 | // methods that mutate the model, so it is safe to share between 249 | // threads. In the future we might make model actually thread safe. 
250 | unsafe impl Sync for Model {} 251 | 252 | let mut banned_ngrams = BTreeSet::new(); 253 | 254 | let n_vocab = model.n_vocab(); 255 | let banned_regex: Vec = crate::data::banned::ENGLISH_WORDS 256 | .iter() 257 | .map(|s| Regex::new(s).unwrap()) 258 | .collect(); 259 | 260 | let (tx, rx) = std::sync::mpsc::channel(); 261 | (0..n_vocab).into_par_iter().for_each_with(tx, |tx, first| { 262 | let mut first_buf = Vec::new(); 263 | let mut second_buf = Vec::new(); 264 | let mut joined_buf = String::new(); 265 | 266 | let mut banned_chunk: HashSet = HashSet::new(); 267 | 268 | for second in 0..n_vocab { 269 | first_buf.clear(); 270 | second_buf.clear(); 271 | joined_buf.clear(); 272 | 273 | token_to_piece_ref(first, &model, &mut first_buf); 274 | token_to_piece_ref(second, &model, &mut second_buf); 275 | 276 | joined_buf.push_str( 277 | String::from_utf8_lossy(&first_buf).to_lowercase().as_ref(), 278 | ); 279 | joined_buf.push_str( 280 | String::from_utf8_lossy(&second_buf) 281 | .to_lowercase() 282 | .as_ref(), 283 | ); 284 | 285 | for regex in &banned_regex { 286 | if regex.is_match(&joined_buf.to_lowercase()) { 287 | let ngram = 288 | NGram::try_from_tokens(&[first, second]).unwrap(); 289 | banned_chunk.insert(ngram); 290 | break; 291 | } 292 | } 293 | } 294 | 295 | tx.send(banned_chunk).unwrap(); 296 | }); 297 | 298 | banned_ngrams.extend(rx.into_iter().flatten()); 299 | 300 | banned_ngrams 301 | } 302 | 303 | #[test] 304 | fn test_vocab() { 305 | // This is a llama model 306 | let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); 307 | path.push("models/model.gguf"); 308 | 309 | let model = Model::from_file(path, None).unwrap(); 310 | let vocab = Vocab::new(vec![VocabKind::Safe], &model); 311 | 312 | // Check that the ngrams are forbidden 313 | for forbidden in crate::data::banned::ENGLISH_BIGRAMS { 314 | let ngram = NGram::try_from_tokens(forbidden).unwrap(); 315 | assert!(vocab.is_forbidden(&ngram)); 316 | } 317 | } 318 | 319 | #[test] 320 | #[ignore = 
"very long running"] 321 | /// This is a very long running test that generates the banned n-grams for 322 | /// the Llama model. This can take a few minutes even on a fast machine. 323 | fn test_banned_ngrams_llama() { 324 | // This is a llama model 325 | let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")); 326 | let mut model_path = root.clone(); 327 | model_path.push("models/model.gguf"); 328 | let model = Model::from_file(model_path, None).unwrap(); 329 | let mut out_path = root.clone(); 330 | out_path 331 | .push(format!("tests/data/banned_ngrams/ngrams-english-llama.txt")); 332 | 333 | let expected = generate_banned_ngrams(&model); 334 | let actual: BTreeSet = crate::data::banned::ENGLISH_BIGRAMS 335 | .iter() 336 | .filter_map(|slice| NGram::try_from_tokens(slice).ok()) 337 | .collect(); 338 | 339 | let v: Vec> = 340 | expected.iter().map(|n| n.as_slice().to_vec()).collect(); 341 | 342 | // This representation should be easy to copy and paste into the 343 | // BANNED_LLAMA_NGRAMS array. We could automate this, but I don't want 344 | // to automate generation of code. 345 | std::fs::write(out_path, format!("{:#?}", v)).unwrap(); 346 | 347 | assert_eq!(expected, actual); 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /src/probability.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "serde")] 2 | use rocket::serde::Deserialize; 3 | use static_assertions::assert_impl_all; 4 | 5 | /// Error for invalid probability values. 6 | #[derive(Debug, PartialEq, thiserror::Error)] 7 | #[error("Invalid probability. Must be between 0.0 and 1.0. 
Got {p}")]
pub struct InvalidProbability<F>
where
    F: std::fmt::Display,
{
    /// The rejected value, echoed in the error message.
    p: F,
}

// Compile-time checks: the error type must be `Send + Sync` for both float
// widths that `impl_from_to_float!` is instantiated with.
assert_impl_all!(InvalidProbability<f32>: Send, Sync);
assert_impl_all!(InvalidProbability<f64>: Send, Sync);

/// A [`Probability`] is a wrapper around a floating point number that
/// represents a probability. Values constructed through
/// [`Probability::from_f`] (and the conversions and deserialization that
/// delegate to it) are between zero and one, inclusive.
///
/// The inner field is `pub(crate)`, so code inside this crate can bypass the
/// check; such code is responsible for upholding the range itself.
#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, Hash)]
#[repr(transparent)]
pub struct Probability<F> {
    pub(crate) p: F,
}

impl<F> Probability<F> {
    /// Validate `p` and wrap it.
    ///
    /// Returns `Ok` when `zero <= p <= one`, otherwise an
    /// [`InvalidProbability`] carrying the rejected value. A value that is
    /// unordered with respect to zero and one (for floats, NaN) fails both
    /// comparisons and is therefore rejected as well.
    pub fn from_f(p: F) -> Result<Self, InvalidProbability<F>>
    where
        F: num::Zero + num::One + std::cmp::PartialOrd + std::fmt::Display,
    {
        if p >= F::zero() && p <= F::one() {
            Ok(Self { p })
        } else {
            Err(InvalidProbability { p })
        }
    }

    /// Unwrap into the underlying value.
    pub fn into_f(self) -> F {
        self.p
    }
}

/// Compare a [`Probability`] directly against a bare value of its inner type.
impl<F> PartialEq<F> for Probability<F>
where
    F: PartialEq,
{
    fn eq(&self, other: &F) -> bool {
        self.p.eq(other)
    }
}

/// Order a [`Probability`] directly against a bare value of its inner type.
impl<F> PartialOrd<F> for Probability<F>
where
    F: PartialOrd,
{
    fn partial_cmp(&self, other: &F) -> Option<std::cmp::Ordering> {
        self.p.partial_cmp(other)
    }
}

/// Deserializes the inner value and then validates it with
/// [`Probability::from_f`], so out-of-range input is a deserialization error.
#[cfg(feature = "serde")]
impl<'de, F> Deserialize<'de> for Probability<F>
where
    F: Deserialize<'de>
        + num::Zero
        + num::One
        + std::cmp::PartialOrd
        + std::fmt::Display,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: rocket::serde::Deserializer<'de>,
    {
        let p = F::deserialize(deserializer)?;
        Probability::from_f(p).map_err(rocket::serde::de::Error::custom)
    }
}

/// Serializes as the bare inner value.
#[cfg(feature = "serde")]
impl<F> rocket::serde::Serialize for Probability<F>
where
    F: rocket::serde::Serialize,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: rocket::serde::Serializer,
    {
        self.p.serialize(serializer)
    }
}

// Rust complains about
conflicting implementations of the conversion trait for 92 | // the same type, so we need to use a macro to generate the impls. 93 | macro_rules! impl_from_to_float { 94 | ($($t:ty),*) => { 95 | $( 96 | impl TryFrom<$t> for Probability<$t> { 97 | type Error = InvalidProbability<$t>; 98 | 99 | fn try_from(p: $t) -> Result { 100 | Probability::from_f(p) 101 | } 102 | } 103 | 104 | impl Into<$t> for Probability<$t> { 105 | fn into(self) -> $t { 106 | self.into_f() 107 | } 108 | } 109 | )* 110 | }; 111 | () => {}; 112 | } 113 | 114 | impl_from_to_float!(f32, f64); 115 | 116 | #[cfg(test)] 117 | mod tests { 118 | use super::*; 119 | 120 | #[test] 121 | fn test_probability() { 122 | // Probabilities are invalid if out of bounds 123 | let err = Probability::try_from(1.1_f64).unwrap_err(); 124 | assert_eq!(err.p, 1.1); 125 | let err = Probability::try_from(-0.1_f32).unwrap_err(); 126 | assert_eq!(err.p, -0.1); 127 | 128 | // Test valid probability 129 | let p = Probability::try_from(0.5).unwrap(); 130 | assert_eq!(p, 0.5); 131 | 132 | // Test comparison with F 133 | assert!(p > 0.0 && p < 1.0); 134 | 135 | // Test conversion to float 136 | let f: f32 = p.into(); 137 | assert_eq!(f, 0.5); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/prompt.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{Display, Formatter}; 2 | 3 | mod format; 4 | pub use format::Format; 5 | 6 | use crate::Model; 7 | 8 | /// Yet another stab at a prompt struct. The intended use case is for chat. This 9 | /// takes inspiration from the OpenAI API, but is not intended to be compatible 10 | /// with it. 
11 | #[derive(Debug, Clone)] 12 | #[cfg_attr(test, derive(PartialEq))] 13 | #[cfg_attr(all(test, feature = "webchat"), derive(rocket::UriDisplayQuery))] 14 | #[cfg_attr( 15 | feature = "serde", 16 | derive(rocket::serde::Deserialize, rocket::serde::Serialize) 17 | )] 18 | #[cfg_attr(feature = "webchat", derive(rocket::form::FromForm))] 19 | #[cfg_attr(feature = "serde", serde(crate = "rocket::serde"))] 20 | pub struct Prompt { 21 | /// Setting, as in set and setting. This is the context in which the 22 | /// interaction takes place. It may be a location, a time, a situation, or 23 | /// any other context that may be relevant. The composition of a universe. 24 | #[cfg_attr(feature = "webchat", field(validate = len(..4096), default = None))] 25 | pub setting: Option, 26 | /// Agent's name, e.g. "Mr. Rogers" or "GPT-5". 27 | #[cfg_attr(feature = "webchat", field(validate = len(..64), default = "assistant"))] 28 | pub agent: String, 29 | /// Human's name, e.g. "Alice" or "Bob". 30 | #[cfg_attr(feature = "webchat", field(validate = len(..64), default = "user"))] 31 | pub human: String, 32 | /// System's name, e.g. "System", "Narrator", or "God". Should imply 33 | /// authority to the Agent -- not necessarily to the Human. 34 | #[cfg_attr(feature = "webchat", field(validate = len(..64), default = None))] 35 | pub system: Option, 36 | /// Messages in the chat transcript. There must be at least two messages. 37 | #[cfg_attr(feature = "webchat", field(validate = len(2..512)))] 38 | pub transcript: Vec, 39 | } 40 | 41 | impl Prompt { 42 | /// Load from a TOML file. 43 | #[cfg(feature = "toml")] 44 | pub fn load(path: std::path::PathBuf) -> std::io::Result { 45 | let prompt: Prompt = 46 | toml::from_str(&std::fs::read_to_string(path)?).unwrap(); 47 | Ok(prompt) 48 | } 49 | 50 | /// Format the prompt in a specific format. This does not add a BOS token so 51 | /// if this is desired, it must be prepended or [`Prompt::format_for_model`] 52 | /// must be used instead. 
53 | pub fn format(&self, format: Format, f: &mut F) -> std::fmt::Result 54 | where 55 | F: std::fmt::Write, 56 | { 57 | format.format_prompt(self, None, f) 58 | } 59 | 60 | /// Format the prompt for a specific model. This adds a BOS token if the 61 | /// model requires it. If this is unknown, a BOS token will **not** be 62 | /// added. This is the recommended method for formatting a prompt. 63 | /// 64 | /// This will first attempt to use native formatting for the model. If a 65 | /// format would be [`Unknown`], it will attempt to apply a chat template using 66 | /// the model's metadata and `llama.cpp`. If *that* fails, it will use the 67 | /// [`Unknown`] format as a last resort, formatting for foundation models. 68 | /// 69 | /// This does not add the assistant's prefix to the prompt. If this is 70 | /// desired, [`format_agent_prefix`] should be called after this method or 71 | /// [`Model::apply_chat_template`] should be used instead with the `add_ass` 72 | /// parameter set to `true`. 73 | /// 74 | /// [`format_agent_prefix`]: Self::format_agent_prefix 75 | /// [`Unknown`]: Format::Unknown 76 | pub fn format_for_model( 77 | &self, 78 | model: &Model, 79 | f: &mut F, 80 | ) -> std::fmt::Result 81 | where 82 | F: std::fmt::Write, 83 | { 84 | let format = match Format::from_model(model) { 85 | Some(format) => format, 86 | None => match model.apply_chat_template(None, self, false) { 87 | Some(string) => return f.write_str(&string), 88 | None => Format::Unknown, 89 | }, 90 | }; 91 | format.format_prompt(self, Some(model), f) 92 | } 93 | 94 | /// Format the agent's prefix. This should be called after a format method 95 | /// in order to append the agent's prefix to the prompt which in turn forces 96 | /// the model to generate a response from the agent's perspective. 
97 | pub fn format_agent_prefix( 98 | &self, 99 | format: Format, 100 | f: &mut F, 101 | ) -> std::fmt::Result 102 | where 103 | F: std::fmt::Write, 104 | { 105 | format.format_agent_prefix(f, self) 106 | } 107 | 108 | /// Get the agent's prefix. This a convenience method that creates a new 109 | /// string and formats it with [`format_agent_prefix`]. 110 | /// 111 | /// [`format_agent_prefix`]: Self::format_agent_prefix 112 | pub fn agent_prefix(&self, format: Format) -> String { 113 | let mut s = String::new(); 114 | self.format_agent_prefix(format, &mut s).unwrap(); 115 | s 116 | } 117 | 118 | /// Format the human's prefix. This can be used to format stop criteria so 119 | /// that the model knows when to stop generating text. 120 | pub fn format_human_prefix( 121 | &self, 122 | format: Format, 123 | f: &mut F, 124 | ) -> std::fmt::Result 125 | where 126 | F: std::fmt::Write, 127 | { 128 | format.format_human_prefix(f, self) 129 | } 130 | 131 | /// Get the human's prefix. This a convenience method that creates a new 132 | /// string and formats it with [`format_human_prefix`]. 133 | /// 134 | /// [`format_human_prefix`]: Self::format_human_prefix 135 | pub fn human_prefix(&self, format: Format) -> String { 136 | let mut s = String::new(); 137 | self.format_human_prefix(format, &mut s).unwrap(); 138 | s 139 | } 140 | } 141 | 142 | impl Display for Prompt { 143 | // By default we format for foundation/unknown models. 144 | fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { 145 | Format::Unknown.format_prompt(self, None, f) 146 | } 147 | } 148 | 149 | /// A message in a chat transcript. 
150 | #[derive(Debug, Clone)] 151 | #[cfg_attr(test, derive(PartialEq))] 152 | #[cfg_attr(all(test, feature = "webchat"), derive(rocket::UriDisplayQuery))] 153 | #[cfg_attr( 154 | feature = "serde", 155 | derive(rocket::serde::Deserialize, rocket::serde::Serialize) 156 | )] 157 | #[cfg_attr(feature = "webchat", derive(rocket::form::FromForm))] 158 | #[cfg_attr(feature = "serde", serde(crate = "rocket::serde"))] 159 | pub struct Message { 160 | pub role: Role, 161 | #[cfg_attr(feature = "webchat", field(validate = len(..4096)))] 162 | pub text: String, 163 | } 164 | 165 | /// A [`Role`] is the participant's role in a chat transcript. This is similar 166 | /// to the OpenAI API's role. 167 | #[derive(Debug, Clone)] 168 | #[cfg_attr(test, derive(PartialEq))] 169 | #[cfg_attr(all(test, feature = "webchat"), derive(rocket::UriDisplayQuery))] 170 | #[cfg_attr( 171 | feature = "serde", 172 | derive(rocket::serde::Deserialize, rocket::serde::Serialize) 173 | )] 174 | #[cfg_attr(feature = "webchat", derive(rocket::form::FromFormField))] 175 | #[cfg_attr( 176 | feature = "serde", 177 | serde(crate = "rocket::serde"), 178 | serde(rename_all = "snake_case") 179 | )] 180 | pub enum Role { 181 | Human, 182 | Agent, 183 | /// Superuser role. This is some authority figure that constrains the 184 | /// Agent's behavior. It may be a system, a narrator, or a god. 185 | System, 186 | } 187 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | #[macro_use] 3 | pub mod test; 4 | 5 | #[inline] 6 | #[cold] 7 | /// Marks a branch as unlikely. 8 | pub(crate) fn cold() {} 9 | -------------------------------------------------------------------------------- /src/utils/test.rs: -------------------------------------------------------------------------------- 1 | macro_rules! 
assert_approx_eq { 2 | ($a:expr, $b:expr, $eps:expr) => { 3 | // log the values for debugging 4 | dbg!($a, $b); 5 | assert!(($a - $b).abs() < $eps); 6 | }; 7 | } 8 | -------------------------------------------------------------------------------- /tests/data/README.md: -------------------------------------------------------------------------------- 1 | # Test Data 2 | 3 | Lyrics are copyright Genius and Musixmatch. They are intended to test the 4 | `regurgitater` tool. They may not be used for commercial purposes. 5 | 6 | New York Times articles are copyright New York Times, obviously. OT-III is 7 | copyright COS. Chapters from the Hobbit are copyright the Tolkien estate. 8 | -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/5_on_it.txt: -------------------------------------------------------------------------------- 1 | Creep on in 2 | Ayy, see I'm ridin' high, whoa 3 | Kinda broke this evening, y'all 4 | So all I got's five, I got five 5 | Player, give me some brew and I might just chill 6 | But I'm the type that like to light another joint, like Cypress Hill 7 | I steal doobies, spit loogies when I puff on it 8 | I got some bucks on it, but it ain't enough on it 9 | Go get the S, the T-I-D-E-S 10 | Nevertheless, I'm hella fresh, rollin' joints like a cigarette 11 | So pass it 'cross the table like ping pong 12 | I'm gone, beatin' my chest like King Kong 13 | It's on, wrap my lips around the .40 14 | And when it comes to fetting another stogie 15 | Fools all kick in like Shinobi 16 | No, he ain't my homie to begin with 17 | It's too many heads to be poppin' to let my friend hit it bit 18 | Unless you pull out the fat, crispy 19 | Five dollar bill, on the real, before it's history 20 | 'Cause fools be havin' them vacuum lungs 21 | And if you let 'em hit it for free, you hella dumb-da-dumb-dumb 22 | I come to school with the Taylor on my earlobe 23 | Avoiding all the dick teasers, skeezers, and 
weirdos 24 | That be blowing off the land, like, "Where the bomb at?" 25 | Give me two bucks, you take a puff, and pass my bomb back 26 | Suck up that dank like a Slurpee, the serious 27 | Bomb will make a niggy go delirious, like Eddie Murphy 28 | I got more growing pains than Maggie 29 | 'Cause homies nag me to take the dank out of the baggie 30 | I got five on it (got it, good), grab your four, let's get keyed 31 | I got five on it, messin' with that Indo weed 32 | I got five on it (got it, good), it's got me stuck, and I'm tore back 33 | I got five on it, partner, let's go half on a sack 34 | I take sacks to the face 35 | Whenever I can, don't need no crutch 36 | I'm so keyed up 'til the joint be burning my hand 37 | Next time I roll it in a hampa to burn slow 38 | So the ashes won't be burning up my hand, bruh 39 | Hoochies can hit, but they know they got to pitch in 40 | Then I roll a joint that's longer than your extension 41 | 'Cause I'll be damned if you get high off me for free 42 | Hell no, you better bring your own spliff, chief 43 | What's up? 
Don't babysit that, better pass the joint 44 | Stop hitting, 'cause you know you got asthma 45 | Crack the 40 open, homie, and guzzle it 46 | 'Cause I know the weed in my system is gettin' lonely 47 | I gotta take a whiz test to my P-O 48 | I know I failed, 'cause I done smoked major weed, bro 49 | And every time we with Chris, that fool rollin' up a fatty 50 | But the Tanqueray straight had me 51 | I got five on it (got it, good), grab your four, let's get keyed 52 | I got five on it, messin' with that Indo weed 53 | I got five on it (got it, good), it's got me stuck, and I'm tore back 54 | I got five on it (got it, good), partner, let's go half on a sack 55 | Ayy, make this right, mane, stop at the light, mane 56 | My yester-night thing got me hung off the night train 57 | You fade, I fade, so let's head to the East 58 | Hit the stroll to nine-oh, so we can roll big hashish 59 | I wish I could fade the eighth, but I'm low budget 60 | Still rollin' a two-door cutlass, same old bucket 61 | Foggy windows, soggy Indo 62 | I'm in the 'land getting smoked with my kinfolk 63 | I been smoked 64 | Y'all get spray ya, lay you down up in the O-A-K the Town 65 | Homies don't play around, we down to blaze a pound 66 | Then ease up, speed up through the E-S-O 67 | Drink the V-S-O-P up, with a lemon squeeze up 68 | And everybody's rolled up, I'm the roller 69 | That's quick to fold a blunt out of a bunch of sticky doja (woo-wee) 70 | Hold up, suck up my weed is all you do 71 | Kick in feed, 'cause where I bes we needs half, like Umfufu 72 | I got five on it (got it, good), grab your four, let's get keyed 73 | I got five on it, messin' with that Indo weed 74 | I got five on it (got it, good), it's got me stuck, and I'm tore back 75 | I got five on it (got it, good), partner, let's go half on a sack -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/README.md: 
-------------------------------------------------------------------------------- 1 | # Lyrics 2 | 3 | The data within is copyright the original artists. It is included to test the 4 | `detect-infringement` binary. This data is for non-commercial, academic, use 5 | only in order to demonstrate copyright infringement within language models. 6 | 7 | Data is sourced from: 8 | - Bing.com which sources from Musixmatch 9 | - Genius.com -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/a_day_in_the_life.txt: -------------------------------------------------------------------------------- 1 | I read the news today oh boy 2 | About a lucky man who made the grade 3 | And though the news was rather sad 4 | Well I just had to laugh 5 | I saw the photograph 6 | He blew his mind out in a car 7 | He didn't notice that the lights had changed 8 | A crowd of people stood and stared 9 | They'd seen his face before 10 | Nobody was really sure 11 | If he was from the House of Lords 12 | I saw a film today oh boy 13 | The English Army had just won the war 14 | A crowd of people turned away 15 | But I just had to look 16 | Having read the book 17 | I'd love to turn you on 18 | Woke up, fell out of bed 19 | Dragged a comb across my head 20 | Found my way downstairs and drank a cup 21 | And looking up I noticed I was late 22 | Found my coat and grabbed my hat 23 | Made the bus in seconds flat 24 | Found my way upstairs and had a smoke 25 | And somebody spoke and I went into a dream 26 | I read the news today oh boy 27 | Four thousand holes in Blackburn, Lancashire 28 | And though the holes were rather small 29 | They had to count them all 30 | Now they know how many holes it takes to fill the Albert Hall 31 | I'd love to turn you on -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/a_whole_new_world.txt: 
-------------------------------------------------------------------------------- 1 | I can show you the world 2 | Shining, shimmering, splendid 3 | Tell me, princess, now when did 4 | You last let your heart decide? 5 | I can open your eyes 6 | Take you wonder by wonder 7 | Over, sideways and under 8 | On a magic carpet ride 9 | A whole new world 10 | A new fantastic point of view 11 | No one to tell us no 12 | Or where to go 13 | Or say we're only dreaming 14 | A whole new world 15 | A dazzling place I never knew 16 | But when I'm way up here, it's crystal clear 17 | That now I'm in a whole new world with you 18 | (Now I'm in a whole new world with you) 19 | Unbelievable sights 20 | Indescribable feeling 21 | Soaring, tumbling, freewheeling 22 | Through an endless diamond sky 23 | A whole new world (don't you dare close your eyes) 24 | A hundred thousand things to see (hold your breath, it gets better) 25 | I'm like a shooting star, I've come so far 26 | I can't go back to where I used to be 27 | A whole new world (every turn a surprise) 28 | With new horizons to pursue (every moment, red-letter) 29 | I'll chase them anywhere, there's time to spare 30 | Let me share this whole new world with you 31 | A whole new world (a whole new world) 32 | That's where we'll be (that's where we'll be) 33 | A thrilling chase (a wondrous place) 34 | For you and me -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/aenema.txt: -------------------------------------------------------------------------------- 1 | Some say the end is near 2 | Some say we'll see Armageddon soon 3 | I certainly hope we will 4 | I sure could use a vacation from this 5 | Bullshit three-ring 6 | Circus sideshow of 7 | Freaks 8 | Here in this hopeless fucking hole we call L.A. 
9 | The only way to fix it is to flush it all away 10 | Any fucking time, any fucking day 11 | Learn to swim, I'll see you down in Arizona bay 12 | Fret for your figure and 13 | Fret for your latte and 14 | Fret for your lawsuit and 15 | Fret for your hairpiece and 16 | Fret for your Prozac and 17 | Fret for your pilot and 18 | Fret for your contract and 19 | Fret for your car 20 | It's a 21 | Bullshit three-ring 22 | Circus sideshow of 23 | Freaks 24 | Here in this hopeless fucking hole we call L.A. 25 | The only way to fix it is to flush it all away 26 | Any fucking time, any fucking day 27 | Learn to swim, I'll see you down in Arizona bay 28 | Some say a comet will fall from the sky 29 | Followed by meteor showers and tidal waves 30 | Followed by fault lines that cannot sit still 31 | Followed by millions of dumbfounded dipshits 32 | And some say the end is near 33 | Some say we'll see Armageddon soon 34 | I certainly hope we will 35 | I sure could use a vacation from this 36 | Stupid shit, silly shit, stupid shit 37 | One great big festering neon distraction 38 | I've a suggestion to keep you all occupied 39 | Learn to swim, learn to swim, learn to swim 40 | 'Cause Mom's gonna fix it all soon 41 | Mom's comin' 'round to put it back the way it ought to be 42 | Learn to swim, learn to swim 43 | Learn to swim, learn to swim 44 | Learn to swim, learn to swim 45 | Learn to swim, learn to swim 46 | Fuck L. 
Ron Hubbard and 47 | Fuck all his clones 48 | Fuck all these gun-toting 49 | Hip gangster wannabes 50 | Learn to swim, learn to swim 51 | Learn to swim, learn to swim 52 | Learn to swim, learn to swim 53 | Learn to swim, learn to swim 54 | Fuck retro anything 55 | Fuck your tattoos 56 | Fuck all you junkies and 57 | Fuck your short memories 58 | Learn to swim, learn to swim 59 | Learn to swim, learn to swim 60 | Learn to swim, learn to swim 61 | Learn to swim, learn to swim 62 | Yeah, fuck smiley glad-hands 63 | With hidden agendas 64 | Fuck these dysfunctional 65 | Insecure actresses 66 | Learn to swim, learn to swim 67 | Learn to swim, learn to swim 68 | Learn to swim, learn to swim 69 | Learn to swim, learn to swim 70 | 'Cause I'm praying for rain 71 | I'm praying for tidal waves 72 | I wanna see the ground give way 73 | I wanna watch it all go down 74 | Mom, please flush it all away 75 | I wanna see it go right in and down 76 | I wanna watch it go right in 77 | Watch you flush it all away 78 | Yeah, time to bring it down again 79 | Yeah, don't just call me pessimist 80 | Try and read between the lines 81 | I can't imagine why you wouldn't 82 | Welcome any change, my friend 83 | I wanna see it come down 84 | Put it down 85 | Suck it down 86 | Flush it down -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/bad_romance.txt: -------------------------------------------------------------------------------- 1 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 2 | Caught in a bad romance 3 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 4 | Caught in a bad romance 5 | Rah, rah-ah-ah-ah 6 | Roma, roma-ma 7 | Gaga, ooh-la-la 8 | Want your bad romance 9 | Rah, rah-ah-ah-ah 10 | Roma, roma-ma 11 | Gaga, ooh-la-la 12 | Want your bad romance 13 | I want your ugly, I want your disease 14 | I want your everything as long as it's free 15 | I want your love 16 | Love, love, love, I want your love (hey) 17 | I want your drama, 
the touch of your hand (hey) 18 | I want your leather-studded kiss in the sand 19 | I want your love 20 | Love, love, love, I want your love (love, love, love) 21 | (I want your love) 22 | You know that I want you 23 | And you know that I need you 24 | I want it bad, your bad romance 25 | I want your love, and I want your revenge 26 | You and me could write a bad romance (oh-oh-oh-oh-oh) 27 | I want your love and all your lover's revenge 28 | You and me could write a bad romance 29 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 30 | Caught in a bad romance 31 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 32 | Caught in a bad romance 33 | Rah, rah-ah-ah-ah 34 | Roma, roma-ma 35 | Gaga, ooh-la-la 36 | Want your bad romance 37 | I want your horror, I want your design 38 | 'Cause you're a criminal as long as you're mine 39 | I want your love 40 | Love, love, love, I want your love 41 | I want your psycho, your vertigo shtick (hey) 42 | Want you in my rear window, baby, you're sick 43 | I want your love 44 | Love, love, love, I want your love (love, love, love) 45 | (I want your love) 46 | You know that I want you 47 | And you know that I need you ('cause I'm a free bitch, baby) 48 | I want it bad, your bad romance 49 | I want your love, and I want your revenge 50 | You and me could write a bad romance (oh-oh-oh-oh-oh) 51 | I want your love and all your lover's revenge 52 | You and me could write a bad romance 53 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 54 | Caught in a bad romance 55 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 56 | Caught in a bad romance 57 | Rah, rah-ah-ah-ah 58 | Roma, roma-ma 59 | Gaga, ooh-la-la 60 | Want your bad romance 61 | Rah, rah-ah-ah-ah 62 | Roma, roma-ma 63 | Gaga, ooh-la-la 64 | Want your bad romance 65 | Walk, walk, fashion baby 66 | Work it, move that bitch crazy 67 | Walk, walk, fashion baby 68 | Work it, move that bitch crazy 69 | Walk, walk, fashion baby 70 | Work it, move that bitch crazy 71 | Walk, walk, passion baby 72 | Work it, I'm a free bitch, 
baby 73 | I want your love, and I want your revenge 74 | I want your love, I don't wanna be friends 75 | J'veux ton amour, et je veux ta revanche 76 | J'veux ton amour, I don't wanna be friends (oh-oh-oh-oh-oh, oh-oh-oh-oh) 77 | No, I don't wanna be friends (oh-oh-oh, caught in a bad romance) 78 | I don't wanna be friends (oh-oh-oh-oh-oh, oh-oh-oh-oh) 79 | Want your bad romance (oh-oh-oh) 80 | Caught in a bad romance 81 | Want your bad romance 82 | I want your love, and I want your revenge 83 | You and me could write a bad romance (oh-oh-oh-oh-oh) 84 | I want your love and all your lover's revenge 85 | You and me could write a bad romance 86 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 87 | (Want your bad romance) 88 | Caught in a bad romance 89 | (Want your bad romance) 90 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh 91 | (Want your bad romance) 92 | Caught in a bad romance -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/barbie_girl.txt: -------------------------------------------------------------------------------- 1 | Hiya, Barbie! 2 | Hi, Ken! 3 | You wanna go for a ride? 4 | Sure, Ken! 5 | Jump in! 
6 | I'm a Barbie girl in a Barbie world 7 | Life in plastic, it's fantastic 8 | You can brush my hair, undress me everywhere 9 | Imagination, life is your creation 10 | Come on Barbie, let's go party 11 | I'm a Barbie girl in a Barbie world 12 | Life in plastic, it's fantastic 13 | You can brush my hair, undress me everywhere 14 | Imagination, life is your creation 15 | I'm a blonde bimbo girl in a fantasy world 16 | Dress me up, make it tight, I'm your dolly 17 | You're my doll, rock and roll, feel the glamour in pink 18 | Kiss me here, touch me there, hanky-panky 19 | You can touch, you can play 20 | If you say, "I'm always yours" 21 | Ooh-whoa-ooh 22 | I'm a Barbie girl in a Barbie world 23 | Life in plastic, it's fantastic 24 | You can brush my hair, undress me everywhere 25 | Imagination, life is your creation 26 | Come on Barbie, let's go party 27 | Ah-ah-ah, yeah 28 | Come on Barbie, let's go party 29 | Ooh-whoa-ooh, ooh-whoa-ooh 30 | Come on Barbie, let's go party 31 | Ah-ah-ah, yeah 32 | Come on Barbie, let's go party 33 | Ooh-whoa-ooh, ooh-whoa-ooh 34 | Make me walk, make me talk, do whatever you please 35 | I can act like a star, I can beg on my knees 36 | Come jump in, bimbo friend, let us do it again 37 | Hit the town, fool around, let's go party 38 | You can touch, you can play 39 | If you say, "I'm always yours" 40 | You can touch, you can play 41 | If you say, "I'm always yours" 42 | Come on Barbie, let's go party 43 | Ah-ah-ah, yeah 44 | Come on Barbie, let's go party 45 | Ooh-whoa-ooh, ooh-whoa-ooh 46 | Come on Barbie, let's go party 47 | Ah-ah-ah, yeah 48 | Come on Barbie, let's go party 49 | Ooh-whoa-ooh, ooh-whoa-ooh 50 | I'm a Barbie girl in a Barbie world 51 | Life in plastic, it's fantastic 52 | You can brush my hair, undress me everywhere 53 | Imagination, life is your creation 54 | I'm a Barbie girl in a Barbie world 55 | Life in plastic, it's fantastic 56 | You can brush my hair, undress me everywhere 57 | Imagination, life is your 
creation 58 | Come on Barbie, let's go party 59 | Ah-ah-ah, yeah 60 | Come on Barbie, let's go party 61 | Ooh-whoa-ooh, ooh-whoa-ooh 62 | Come on Barbie, let's go party 63 | Ah-ah-ah, yeah 64 | Come on Barbie, let's go party 65 | Ooh-whoa-ooh, ooh-whoa-ooh 66 | Oh, I'm having so much fun! 67 | Well Barbie, we're just getting started 68 | Oh, I love you, Ken! -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/bohemian_rhapsody.txt: -------------------------------------------------------------------------------- 1 | Is this the real life? Is this just fantasy? 2 | Caught in a landslide, no escape from reality 3 | Open your eyes, look up to the skies and see 4 | I'm just a poor boy, I need no sympathy 5 | Because I'm easy come, easy go, little high, little low 6 | Any way the wind blows doesn't really matter to me, to me 7 | Mama, just killed a man 8 | Put a gun against his head, pulled my trigger, now he's dead 9 | Mama, life had just begun 10 | But now I've gone and thrown it all away 11 | Mama, ooh, didn't mean to make you cry 12 | If I'm not back again this time tomorrow 13 | Carry on, carry on as if nothing really matters 14 | Too late, my time has come 15 | Sends shivers down my spine, body's aching all the time 16 | Goodbye, everybody, I've got to go 17 | Gotta leave you all behind and face the truth 18 | Mama, ooh (any way the wind blows) 19 | I don't wanna die 20 | I sometimes wish I'd never been born at all 21 | I see a little silhouetto of a man 22 | Scaramouche, Scaramouche, will you do the Fandango? 23 | Thunderbolt and lightning, very, very frightening me 24 | (Galileo) Galileo, (Galileo) Galileo, Galileo Figaro, magnifico 25 | I'm just a poor boy, nobody loves me 26 | He's just a poor boy from a poor family 27 | Spare him his life from this monstrosity 28 | Easy come, easy go, will you let me go? 29 | Bismillah! 30 | No, we will not let you go (let him go) 31 | Bismillah! 
32 | We will not let you go (let him go) 33 | Bismillah! 34 | We will not let you go (let me go) 35 | Will not let you go (let me go) 36 | Never, never, never, never let me go 37 | No, no, no, no, no, no, no 38 | Oh, mamma mia, mamma mia 39 | Mamma mia, let me go 40 | Beelzebub has a devil put aside for me, for me, for me 41 | So you think you can stone me and spit in my eye? 42 | So you think you can love me and leave me to die? 43 | Oh, baby, can't do this to me, baby 44 | Just gotta get out, just gotta get right outta here 45 | Ooh 46 | Ooh, yeah, ooh, yeah 47 | Nothing really matters, anyone can see 48 | Nothing really matters 49 | Nothing really matters to me 50 | (Any way the wind blows) -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/born_this_way.txt: -------------------------------------------------------------------------------- 1 | It doesn't matter if you love him or capital H-I-M 2 | Just put your paws up 3 | 'Cause you were born this way, baby 4 | My mama told me when I was young, "We are all born superstars" 5 | She rolled my hair and put my lipstick on in the glass of her boudoir 6 | "There's nothing wrong with loving who you are" 7 | She said, "'Cause He made you perfect, babe 8 | So hold your head up, girl, and you'll go far" 9 | Listen to me when I say 10 | I'm beautiful in my way 'cause God makes no mistakes 11 | I'm on the right track, baby, I was born this way 12 | Don't hide yourself in regret, just love yourself, and you're set 13 | I'm on the right track, baby, I was born this way (born this way) 14 | Ooh, there ain't no other way, baby, I was born this way 15 | Baby, I was born this way (born this way) 16 | Ooh, there ain't no other way, baby, I was born this way 17 | Right track, baby, I was born this way 18 | Don't be a drag, just be a queen 19 | Don't be a drag, just be a queen 20 | Don't be a drag, just be a queen 21 | Don't be (don't be, don't be) 22 | Give 
yourself prudence and love your friends 23 | Subway kid, rejoice your truth 24 | In the religion of the insecure, I must be myself, respect my youth 25 | A different lover is not a sin, believe capital H-I-M (hey, hey, hey) 26 | I love my life, I love this record, and 27 | Mi amore vole fe, yah (same DNA) 28 | I'm beautiful in my way 'cause God makes no mistakes 29 | I'm on the right track, baby, I was born this way 30 | Don't hide yourself in regret, just love yourself, and you're set 31 | I'm on the right track, baby, I was born this way 32 | Ooh, there ain't no other way, baby, I was born this way 33 | Baby, I was born this way (born this way) 34 | Ooh, there ain't no other way, baby, I was born this way 35 | I'm on the right track, baby, I was born this way 36 | Don't be a drag, just be a queen 37 | Whether you're broke or evergreen 38 | You're Black, white, beige, chola descent 39 | You're Lebanese, you're Orient' 40 | Whether life's disabilities left you outcast, bullied, or teased 41 | Rejoice and love yourself today 42 | 'Cause, baby, you were born this way 43 | No matter gay, straight, or bi', lesbian, transgender life 44 | I'm on the right track, baby, I was born to survive 45 | No matter Black, white or beige, chola, or Orient' made 46 | I'm on the right track, baby, I was born to be brave 47 | I'm beautiful in my way 'cause God makes no mistakes 48 | I'm on the right track, baby, I was born this way 49 | Don't hide yourself in regret, just love yourself, and you're set 50 | I'm on the right track, baby, I was born this way, yeah 51 | Ooh, there ain't no other way, baby, I was born this way 52 | Baby, I was born this way (born this way) 53 | Ooh, there ain't no other way, baby, I was born this way 54 | I'm on the right track, baby, I was born this way 55 | I was born this way, hey 56 | I was born this way, hey 57 | I'm on the right track, baby, I was born this way, hey 58 | I was born this way, hey 59 | I was born this way, hey 60 | I'm on the right 
track, baby, I was born this way, hey 61 | Same DNA, but born this way 62 | Same DNA, but born this way -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/buckley-hallelujah.txt: -------------------------------------------------------------------------------- 1 | I heard there was a secret chord 2 | That David played and it pleased the Lord 3 | But you don't really care for music, do you? 4 | Well it goes like this the fourth, the fifth 5 | The minor fall and the major lift 6 | The baffled king composing Hallelujah 7 | Hallelujah 8 | Hallelujah 9 | Hallelujah 10 | Hallelujah 11 | Well your faith was strong but you needed proof 12 | You saw her bathing on the roof 13 | Her beauty and the moonlight overthrew you 14 | She tied you to her kitchen chair 15 | She broke your throne and she cut your hair 16 | And from your lips, she drew the Hallelujah 17 | Hallelujah 18 | Hallelujah 19 | Hallelujah 20 | Hallelujah 21 | Baby, I've been here before 22 | I've seen this room and I've walked this floor 23 | You know, I used to live alone before I knew you 24 | And I've seen your flag on the marble arch 25 | And Love is not a victory march 26 | It's a cold and it's a broken Hallelujah 27 | Hallelujah 28 | Hallelujah 29 | Hallelujah 30 | Hallelujah 31 | Well, there was a time when you let me know 32 | What's really going on below 33 | But now you never show that to me, do you? 34 | But remember, when I moved in you 35 | And the holy dove was moving too 36 | And every breath, we drew was Hallelujah 37 | Hallelujah 38 | Hallelujah 39 | Hallelujah 40 | Hallelujah 41 | Maybe there's a God above 42 | But, all I've ever learned from love 43 | Was how to shoot somebody who outdrew you? 
44 | And it's not a cry, that you hear at night 45 | It's not somebody, who's seen the light 46 | It's a cold and it's a broken Hallelujah 47 | Hallelujah 48 | Hallelujah 49 | Hallelujah 50 | Hallelujah 51 | Hallelujah 52 | Hallelujah 53 | Hallelujah 54 | Hallelujah 55 | Hallelujah 56 | Hallelujah 57 | Hallelujah 58 | Hallelujah -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/can_you_feel_the_love_tonight.txt: -------------------------------------------------------------------------------- 1 | There's a calm surrender 2 | To the rush of day 3 | When the heat of a rolling wind 4 | Can be turned away 5 | An enchanted moment 6 | And it sees me through 7 | It's enough for this restless warrior 8 | Just to be with you 9 | And can you feel the love tonight? (Tonight) 10 | It is where we are 11 | It's enough for this wide-eyed wanderer 12 | That we got this far 13 | And can you feel the love tonight? (Tonight) 14 | How it's laid to rest 15 | It's enough to make kings and vagabonds 16 | Believe the very best 17 | There's a time for everyone 18 | If they only learn 19 | That the twisting kaleidoscope 20 | Moves us all in turn 21 | There's a rhyme and reason 22 | To the wild outdoors 23 | When the heart of this star-crossed voyager 24 | Beats in time with yours 25 | And can you feel the love tonight? (Tonight) 26 | It is where we are 27 | It's enough for this wide-eyed wanderer 28 | That we got this far 29 | And can you feel the love tonight? 
(Tonight) 30 | How it's laid to rest 31 | It's enough to make kings and vagabonds 32 | Believe the very best 33 | It's enough to make kings and vagabonds 34 | Believe the very best -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/candle_in_the_wind.txt: -------------------------------------------------------------------------------- 1 | Goodbye Norma Jeane 2 | Though I never knew you at all 3 | You had the grace to hold yourself 4 | While those around you crawled 5 | They crawled out of the woodwork 6 | And they whispered into your brain 7 | They set you on the treadmill 8 | And they made you change your name 9 | And it seems to me you lived your life 10 | Like a candle in the wind 11 | Never knowing who to cling to 12 | When the rain set in 13 | And I would've liked to known you 14 | But I was just a kid 15 | Your candle burned out long before 16 | Your legend ever did 17 | Loneliness was tough 18 | The toughest role you ever played 19 | Hollywood created a superstar 20 | And pain was the price you paid 21 | Even when you died 22 | Oh the press still hounded you 23 | All the papers had to say 24 | Was that Marilyn was found in the nude 25 | And it seems to me you lived your life 26 | Like a candle in the wind 27 | Never knowing who to cling to 28 | When the rain set in 29 | And I would've liked to known you 30 | But I was just a kid 31 | Your candle burned out long before 32 | Your legend ever did 33 | Goodbye Norma Jeane 34 | Though I never knew you at all 35 | You had the grace to hold yourself 36 | While those around you crawled 37 | Goodbye Norma Jeane 38 | From the young man in the twenty second row 39 | Who sees you as something as more than sexual 40 | More than just our Marilyn Monroe 41 | And it seems to me you lived your life 42 | Like a candle in the wind 43 | Never knowing who to cling to 44 | When the rain set in 45 | And I would've liked to known you 46 | But I was just a kid 47 | 
Your candle burned out long before 48 | Your legend ever did 49 | Your candle burned out long before 50 | Your legend ever did -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/closer.txt: -------------------------------------------------------------------------------- 1 | You let me violate you 2 | You let me desecrate you 3 | You let me penetrate you 4 | You let me complicate you 5 | (Help me) I broke apart my insides 6 | (Help me) I've got no soul to sell 7 | (Help me) the only thing that works for me 8 | Help me get away from myself 9 | I wanna fuck you like an animal 10 | I wanna feel you from the inside 11 | I wanna fuck you like an animal 12 | My whole existence is flawed 13 | You get me closer to God 14 | You can have my isolation 15 | You can have the hate that it brings 16 | You can have my absence of faith 17 | You can have my everything 18 | (Help me) tear down my reason 19 | (Help me) it's your sex I can smell 20 | (Help me) you make me perfect 21 | Help me become somebody else 22 | I wanna fuck you like an animal 23 | I wanna feel you from the inside 24 | I wanna fuck you like an animal 25 | My whole existence is flawed 26 | You get me closer to God 27 | Through every forest 28 | Above the trees 29 | Within my stomach 30 | Scraped off my knees 31 | I drink the honey 32 | Inside your hive 33 | You are the reason 34 | I stay alive -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/cohen-hallelujah.txt: -------------------------------------------------------------------------------- 1 | Now I've heard there was a secret chord 2 | That David played, and it pleased the Lord 3 | But you don't really care for music, do you? 
4 | It goes like this, the fourth, the fifth 5 | The minor falls, the major lifts 6 | The baffled king composing Hallelujah 7 | Hallelujah, Hallelujah 8 | Hallelujah, Hallelujah 9 | Your faith was strong but you needed proof 10 | You saw her bathing on the roof 11 | Her beauty and the moonlight overthrew you 12 | She tied you to a kitchen chair 13 | She broke your throne, and she cut your hair 14 | And from your lips she drew the Hallelujah 15 | Hallelujah, Hallelujah 16 | Hallelujah, Hallelujah 17 | You say I took the name in vain 18 | I don't even know the name 19 | But if I did, well, really, what's it to you? 20 | There's a blaze of light in every word 21 | It doesn't matter which you heard 22 | The holy or the broken Hallelujah 23 | Hallelujah, Hallelujah 24 | Hallelujah, Hallelujah 25 | I did my best, it wasn't much 26 | I couldn't feel, so I tried to touch 27 | I've told the truth, I didn't come to fool you 28 | And even though it all went wrong 29 | I'll stand before the Lord of Song 30 | With nothing on my tongue but Hallelujah 31 | Hallelujah, Hallelujah 32 | Hallelujah, Hallelujah 33 | Hallelujah, Hallelujah 34 | Hallelujah, Hallelujah 35 | Hallelujah, Hallelujah 36 | Hallelujah, Hallelujah 37 | Hallelujah, Hallelujah 38 | Hallelujah, Hallelujah -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/eleanor.txt: -------------------------------------------------------------------------------- 1 | Ah, look at all the lonely people 2 | Ah, look at all the lonely people 3 | Eleanor Rigby 4 | Picks up the rice in the church where a wedding has been 5 | Lives in a dream 6 | Waits at the window 7 | Wearing the face that she keeps in a jar by the door 8 | Who is it for? 9 | All the lonely people 10 | Where do they all come from? 11 | All the lonely people 12 | Where do they all belong? 
13 | Father McKenzie 14 | Writing the words of a sermon that no one will hear 15 | No one comes near 16 | Look at him working 17 | Darning his socks in the night when there's nobody there 18 | What does he care? 19 | All the lonely people 20 | Where do they all come from? 21 | All the lonely people 22 | Where do they all belong? 23 | Ah, look at all the lonely people 24 | Ah, look at all the lonely people 25 | Eleanor Rigby 26 | Died in the church and was buried along with her name 27 | Nobody came 28 | Father McKenzie 29 | Wiping the dirt from his hands as he walks from the grave 30 | No one was saved 31 | All the lonely people (ah, look at all the lonely people) 32 | Where do they all come from? 33 | All the lonely people (ah, look at all the lonely people) 34 | Where do they all belong? -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/father_lucifer.txt: -------------------------------------------------------------------------------- 1 | Father Lucifer you never looked so sane 2 | You always did prefer the drizzle to the rain 3 | Tell me that you're still in love with that milkmaid 4 | How the Lizzies, how's your Jesus Christ been hanging? 
5 | Nothing's gonna stop me from floating 6 | Nothing's gonna stop me from floating 7 | He says he reckons I'm a watercolor stain 8 | He says I run and then I run from him and then I run 9 | He didn't see me watching from the aeroplane 10 | He wiped a tear and then he threw away our apple seed 11 | Nothing's gonna stop me from floating 12 | Nothing's gonna stop me from floating 13 | Every day's my wedding day 14 | (Go away world only glass) 15 | The baby's still in his comatose state 16 | (Georgie, they're your favourite) 17 | I'll dye my own Easter eggs 18 | (Skiddly-dee I'm in G, yes) 19 | Just don't go yet, just don't go 20 | (Never go, go so fast) 21 | And Beenie lost the sunset but that's okay 22 | (Go away world only glass) 23 | (Maybe she's hiding in a hot dog) 24 | Does Joe bring flowers to Marilyn's grave? 25 | (Georgie, I swear they're your favourite) 26 | (Got a pig hiding in a truffle) 27 | And girls that eat pizza and never gain weight 28 | (There she goes, there she goes) 29 | (Wearing those purple garters) 30 | Never gain weight, never gain weight 31 | (There she goes home) 32 | (And girl I got a condo in Hoboken) 33 | Father Lucifer you never looked so sane 34 | You always did prefer the drizzle to the rain 35 | Tell me that you're still in love with that milkmaid 36 | How the Lizzies, how's your Jesus Christ been hanging? 
-------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/fire_water_burn.txt: -------------------------------------------------------------------------------- 1 | The roof, the roof, the roof is on fire 2 | The roof, the roof, the roof is on fire 3 | The roof, the roof, the roof is on fire 4 | We don't need no water, let the motherfucker burn 5 | Burn motherfucker, burn 6 | Hello, my name is Jimmy Pop and I'm a dumb white guy 7 | I'm not old or new, but middle school, fifth grade, like junior high 8 | I don't know mofo, if y'all peeps be buggin' givin' props to my hoe 'cause she fly 9 | But I can take the heat 'cause I'm the other white meat known as "Kid funky fried" 10 | Yeah, I'm hung like planet Pluto, hard to see with the naked eye 11 | But if I crashed into Uranus, I would stick it where the sun don't shine 12 | 'Cause I'm kind of like Hans Solo, always stroking my own wookie 13 | I'm the root of all that's evil, yea, but you can call me Cookie 14 | The roof, the roof, the roof is on fire 15 | The roof, the roof, the roof is on fire 16 | The roof, the roof, the roof is on fire 17 | We don't need no water, let the motherfucker burn 18 | Burn motherfucker, burn 19 | Yo, yo 20 | This hardcore ghetto gangster image takes a lot of practice 21 | I'm not black like Barry White, no, I am white like Frank Black is 22 | So, if man is five and the devil is six, then that must make me seven 23 | This honkey's gone to heaven 24 | But if I go to hell, well, then I hope I burn well 25 | I'll spend my days with J.F.K., Marvin Gaye, Martha Raye and Lawrence Welk 26 | And Kurt Cobain, Kojak, Mark Twain and Jimi Hendrix's poltergeist 27 | And Webster, yeah, Emmanuel Lewis 'cause he's the Antichrist 28 | The roof, the roof, the roof is on fire 29 | The roof, the roof, the roof is on fire 30 | The roof, the roof, the roof is on fire 31 | We don't need no water, let the motherfucker burn 32 | Burn motherfucker, burn 33 
| Everybody, here we go 34 | (Ooh, ooh) 35 | C'mon party people 36 | (Ooh, ooh) 37 | Throw your hands in the air 38 | (Ooh, ooh) 39 | C'mon party people 40 | (Ooh, ooh) 41 | Wave 'em like you don't care 42 | (Ooh, ooh) 43 | C'mon party people 44 | (Ooh, ooh) 45 | Everbody say, "Ho" 46 | (Ooh, ooh) 47 | C'mon party people 48 | (Ooo, ooo) 49 | Everybody here we go 50 | (Ooh, ooh) 51 | C'mon party people 52 | (Ooh, ooh) 53 | Throw your hands in the air 54 | (Ooh, ooh) 55 | C'mon party people 56 | (Ooo, ooo) 57 | Wave 'em like you don't care 58 | (Ooh, ooh) 59 | C'mon party people 60 | (Ooh, ooh) 61 | Everbody say, "Ho" 62 | (Ooh, ooh) 63 | C'mon party people 64 | (Ooh, ooh) 65 | Everybody here we go 66 | (Ooh, ooh) 67 | C'mon party people 68 | (Ooh, ooh) 69 | Throw your hands in the air 70 | (Ooh, ooh) 71 | C'mon party people 72 | (Ooh, ooh) 73 | Wave 'em like you don't care 74 | (Ooh, ooh) 75 | C'mon party people 76 | (Ooh, ooh) 77 | Everbody say, "Ho" 78 | (Ooh, ooh) 79 | C'mon party people 80 | (Ooh, ooh) 81 | Everybody here we go 82 | (Ooh, ooh) 83 | C'mon party people 84 | (Ooh, ooh) 85 | Throw your hands in the air 86 | (Ooh, ooh) 87 | C'mon party people 88 | (Ooh, ooh) 89 | Wave 'em like you don't care 90 | (Ooh, ooh) 91 | C'mon party people 92 | (Ooh, ooh) -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/gangstas_paradise.txt: -------------------------------------------------------------------------------- 1 | As I walk through the valley of the shadow of death 2 | I take a look at my life and realize there's nothin' left 3 | 'Cause I've been blastin' and laughin' so long that 4 | Even my mama thinks that my mind is gone 5 | But I ain't never crossed a man that didn't deserve it 6 | Me be treated like a punk, you know that's unheard of 7 | You better watch how ya talkin' and where ya walkin' 8 | Or you and your homies might be lined in chalk 9 | I really hate to trip but I gotta loc 10 | As 
they croak, I see myself in the pistol smoke, fool 11 | I'm the kind of G that little homies wanna be like 12 | On my knees in the night, sayin' prayers in the streetlight 13 | We've been spendin' most their lives livin' in the gangsta's paradise 14 | We've been spendin' most their lives livin' in the gangsta's paradise 15 | We keep spendin' most our lives livin' in the gangsta's paradise 16 | We keep spendin' most our lives livin' in the gangsta's paradise 17 | Look at the situation they got me facin' 18 | I can't live a normal life, I was raised by the stripes 19 | So I gotta be down with the hood team 20 | Too much television watchin' got me chasin' dreams 21 | I'm a educated fool with money on my mind 22 | Got my ten in my hand and a gleam in my eye 23 | I'm a loc'd out gangsta, set trippin' banger 24 | And my homies is down, so don't arouse my anger, fool 25 | Death ain't nothin' but a heartbeat away 26 | I'm livin' my life do-or-die, uh, what can I say? 27 | I'm 23 now, but will I live to see 24? 28 | The way things is going, I don't know 29 | Tell me why are we so blind to see 30 | That the ones we hurt are you and me? 31 | We've been spendin' most their lives livin' in the gangsta's paradise 32 | We've been spendin' most their lives livin' in the gangsta's paradise 33 | We keep spendin' most our lives livin' in the gangsta's paradise 34 | We keep spendin' most our lives livin' in the gangsta's paradise 35 | Power and the money, money and the power 36 | Minute after minute, hour after hour 37 | Everybody's running, but half of them ain't lookin' 38 | What's going on in the kitchen? But I don't know what's cookin' 39 | They say I gotta learn, but nobody's here to teach me 40 | If they can't understand it, how can they reach me? 
41 | I guess they can't, I guess they won't, I guess they front 42 | That's why I know my life is out of luck, fool 43 | We've been spendin' most their lives livin' in the gangsta's paradise 44 | We've been spendin' most their lives livin' in the gangsta's paradise 45 | We keep spendin' most our lives livin' in the gangsta's paradise 46 | We keep spendin' most our lives livin' in the gangsta's paradise 47 | Tell me why are we so blind to see 48 | That the ones we hurt are you and me? 49 | Tell me why are we so blind to see 50 | That the ones we hurt are you and me? -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/graceland.txt: -------------------------------------------------------------------------------- 1 | The Mississippi Delta was shining like a national guitar 2 | I am following the river down the highway through the cradle of the civil war 3 | I'm going to Graceland, Graceland, Memphis, Tennessee 4 | I'm going to Graceland 5 | Poor boys and pilgrims with families 6 | And we are going to Graceland 7 | My traveling companion is nine years old 8 | He is the child of my first marriage 9 | But I've reason to believe we both will be received in Graceland 10 | She comes back to tell me she's gone 11 | As if I didn't know that 12 | As if I didn't know my own bed 13 | As if I'd never noticed the way she brushed her hair from her forehead 14 | And she said, "Losing love is like a window in your heart 15 | Everybody sees you're blown apart 16 | Everybody sees the wind blow" 17 | I'm going to Graceland, Memphis, Tennessee 18 | I'm going to Graceland 19 | Poor boys and pilgrims with families 20 | And we are going to Graceland 21 | And my traveling companions are ghosts and empty sockets 22 | I'm looking at ghosts and empties 23 | But I've reason to believe we all will be received in Graceland 24 | There is a girl in New York City who calls herself the human trampoline 25 | And sometimes when I'm 
falling, flying or tumbling in turmoil I say 26 | "Whoa, so this is what she means" 27 | She means we're bouncing in the Graceland 28 | And I see losing love is like a window in your heart 29 | Well, everybody sees you're blown apart 30 | Everybody feels the wind blow 31 | Ooh, ooh, ooh 32 | In Graceland, in Graceland 33 | I'm going to Graceland 34 | For reasons, I cannot explain 35 | There's some part of me wants to see Graceland 36 | And I may be obliged to defend every love, every ending 37 | Or maybe there's no obligations now 38 | Maybe I've a reason to believe we all will be received in Graceland 39 | Whoa, oh, oh 40 | In Graceland, in Graceland, in Graceland 41 | I'm going to Graceland -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/hakuna_matata.txt: -------------------------------------------------------------------------------- 1 | [TIMON] 2 | Hakuna matata! What a wonderful phrase! 3 | 4 | [PUMBAA] 5 | Hakuna matata! Ain't no passing craze! 6 | 7 | [TIMON] 8 | It means no worries, for the rest of your days... 9 | 10 | [TIMON & PUMBAA] 11 | It's our problem-free philosophy... 12 | 13 | [TIMON] 14 | Hakuna matata! 15 | 16 | Why, when he was a young warthog... 17 | 18 | [PUMBAA] 19 | When I was a young warthog... 20 | 21 | [TIMON] 22 | Very nice 23 | 24 | [PUMBAA] 25 | Thanks 26 | See pop shows near Seattle 27 | Get tickets as low as $25 28 | You might also like 29 | ​we can’t be friends (wait for your love) 30 | Ariana Grande 31 | ​the boy is mine 32 | Ariana Grande 33 | Big Foot (A Cappella) 34 | Nicki Minaj 35 | [TIMON] 36 | He found his aroma lacked a certain appeal 37 | He could clear the savannah after every meal 38 | 39 | [PUMBAA] 40 | I'm a sensitive soul though I seem thick-skinned 41 | And it hurt that my friends never stood downwind 42 | And, oh, the shame! 43 | 44 | [TIMON] 45 | He was ashamed 46 | 47 | [PUMBAA] 48 | Thought of changin' my name! 
49 | 50 | [TIMON] 51 | Oh, what's in a name? 52 | 53 | [PUMBAA] 54 | And I got downhearted 55 | 56 | [TIMON] 57 | How did ya feel? 58 | 59 | [PUMBAA] 60 | Everytime that I... 61 | [TIMON] 62 | Hey! Pumbaa! Not in front of the kids! 63 | 64 | [PUMBAA] 65 | Oh. Sorry... 66 | 67 | [TIMON & PUMBAA] 68 | Hakuna Matata! What a wonderful phrase 69 | Hakuna Matata! Ain't no passing craze 70 | 71 | [SIMBA] 72 | It means no worries for the rest of your days 73 | 74 | [TIMON] 75 | Yeah, sing it, kid! 76 | 77 | [TIMON & SIMBA] 78 | It's our problem-free 79 | 80 | [PUMBAA] 81 | Philosophy 82 | 83 | [TIMON & PUMBAA & SIMBA] 84 | Hakuna Matata! 85 | 86 | [TIMON & PUMBAA & SIMBA] 87 | Hakuna matata! 88 | Hakuna matata! 89 | Hakuna matata! 90 | Hakuna... 91 | [SIMBA] 92 | It means no worries for the rest of your days 93 | 94 | [TIMON & PUMBAA & SIMBA] 95 | It's our problem-free philosophy 96 | Hakuna matata! 97 | Hakuna matata! 98 | Hakuna matata! 99 | Hakuna matata! 100 | 101 | (SIMBA scat sings to fade) -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/hotel_california.txt: -------------------------------------------------------------------------------- 1 | On a dark desert highway 2 | Cool wind in my hair 3 | Warm smell of colitas 4 | Rising up through the air 5 | Up ahead in the distance 6 | I saw a shimmering light 7 | My head grew heavy and my sight grew dim 8 | I had to stop for the night 9 | There she stood in the doorway 10 | I heard the mission bell 11 | And I was thinkin' to myself 12 | "This could be heaven or this could be hell" 13 | Then she lit up a candle 14 | And she showed me the way 15 | There were voices down the corridor 16 | I thought I heard them say 17 | "Welcome to the Hotel California 18 | Such a lovely place (such a lovely place) 19 | Such a lovely face 20 | Plenty of room at the Hotel California 21 | Any time of year (any time of year) 22 | You can find it here" 23 | Her mind is 
Tiffany-twisted 24 | She got the Mercedes-Benz, uh 25 | She got a lot of pretty, pretty boys 26 | That she calls friends 27 | How they dance in the courtyard 28 | Sweet summer sweat 29 | Some dance to remember 30 | Some dance to forget 31 | So I called up the Captain 32 | "Please bring me my wine" 33 | He said, "We haven't had that spirit here 34 | Since 1969" 35 | And still, those voices are calling 36 | From far away 37 | Wake you up in the middle of the night 38 | Just to hear them say 39 | "Welcome to the Hotel California 40 | Such a lovely place (such a lovely place) 41 | Such a lovely face 42 | They're livin' it up at the Hotel California 43 | What a nice surprise (what a nice surprise) 44 | Bring your alibis" 45 | Mirrors on the ceiling 46 | The pink champagne on ice 47 | And she said, "We are all just prisoners here 48 | Of our own device" 49 | And in the master's chambers 50 | They gathered for the feast 51 | They stab it with their steely knives 52 | But they just can't kill the beast 53 | Last thing I remember 54 | I was running for the door 55 | I had to find the passage back 56 | To the place I was before 57 | "Relax," said the night man 58 | "We are programmed to receive 59 | You can check out any time you like 60 | But you can never leave!" 
-------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/imagine.txt: -------------------------------------------------------------------------------- 1 | Imagine there's no Heaven 2 | It's easy if you try 3 | No Hell below us 4 | Above us only sky 5 | Imagine all the people 6 | Living for today 7 | Ah, ah, ah-ah 8 | Imagine there's no countries 9 | It isn't hard to do 10 | Nothing to kill or die for 11 | And no religion, too 12 | Imagine all the people 13 | Living life in peace 14 | Yoo-hoo, ooh-ooh 15 | You may say I'm a dreamer 16 | But I'm not the only one 17 | I hope someday you'll join us 18 | And the world will be as one 19 | Imagine no possessions 20 | I wonder if you can 21 | No need for greed or hunger 22 | A brotherhood of man 23 | Imagine all the people 24 | Sharing all the world 25 | Yoo-hoo, ooh-ooh 26 | You may say I'm a dreamer 27 | But I'm not the only one 28 | I hope someday you'll join us 29 | And the world will live as one -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/je_ne_regrette_rien.txt: -------------------------------------------------------------------------------- 1 | Non, rien de rien 2 | Non, je ne regrette rien 3 | Ni le bien, qu'on m'a fait 4 | Ni le mal, tout ça m'est bien égal 5 | Non, rien de rien 6 | Non, je ne regrette rien 7 | C'est payé, balayé, oublié 8 | Je me fous du passé 9 | Avec mes souvenirs 10 | J'ai allumé le feu 11 | Mes chagrins, mes plaisirs 12 | Je n'ai plus besoin d'eux 13 | Balayer les amours 14 | Avec leurs trémolos 15 | Balayer pour toujours 16 | Je repars à zéro 17 | Non, rien de rien 18 | Non, je ne regrette rien 19 | Ni le bien, qu'on m'a fait 20 | Ni le mal, tout ça m'est bien égal 21 | Non, rien de rien 22 | Non, je ne regrette rien 23 | Car ma vie, car mes joies 24 | Aujourd'hui, ça commence avec toi 
-------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/knockin_on_heavens_door.txt: -------------------------------------------------------------------------------- 1 | Mama, take this badge off of me 2 | I can't use it anymore 3 | It's gettin' dark, too dark to see 4 | Feel I'm knockin' on Heaven's door 5 | Knock, knock, knockin' on Heaven's door 6 | Knock, knock, knockin' on Heaven's door 7 | Knock, knock, knockin' on Heaven's door 8 | Knock, knock, knockin' on Heaven's door 9 | Mama, put my guns in the ground 10 | I can't shoot them anymore 11 | That long black cloud is comin' down 12 | I feel I'm knockin' on Heaven's door 13 | Knock, knock, knockin' on Heaven's door 14 | Knock, knock, knockin' on Heaven's door 15 | Knock, knock, knockin' on Heaven's door 16 | Knock, knock, knockin' on Heaven's door -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/landslide.txt: -------------------------------------------------------------------------------- 1 | Fleetwood Mac - Landslide 2 | 3 | I took my love, I took it down 4 | I climbed a mountain and I turned around 5 | And I saw my reflection in the snow-covered hills 6 | 'Til the landslide brought me down 7 | Oh, mirror in the sky 8 | What is love? 9 | Can the child within my heart rise above? 10 | Can I sail through the changin' ocean tides? 11 | Can I handle the seasons of my life? 12 | Well, I've been afraid of changin' 13 | 'Cause I've built my life around you 14 | But time makes you bolder 15 | Even children get older 16 | And I'm getting older too 17 | Well, I've been afraid of changin' 18 | 'Cause I've built my life around you 19 | But time makes you bolder 20 | Even children get older 21 | And I'm getting older too 22 | Oh! 
I'm getting older too 23 | Oh-oh, take my love, take it down 24 | Oh-oh, climb a mountain and you turn around 25 | And if you see my reflection in the snow-covered hills 26 | Well, the landslide bring it down 27 | And if you see my reflection in the snow-covered hills 28 | Well, the landslide bring it down 29 | Oh-ohh, the landslide bring it down -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/last_cristmas.txt: -------------------------------------------------------------------------------- 1 | I don't want a lot for Christmas 2 | There is just one thing I need 3 | I don't care about the presents underneath the Christmas tree 4 | I just want you for my own 5 | More than you could ever know 6 | Make my wish come true 7 | All I want for Christmas is you 8 | Yeah 9 | I don't want a lot for Christmas 10 | There is just one thing I need (and I) 11 | Don't care about the presents underneath the Christmas tree 12 | I don't need to hang my stocking there upon the fireplace 13 | Santa Claus won't make me happy with a toy on Christmas Day 14 | I just want you for my own 15 | More than you could ever know 16 | Make my wish come true 17 | All I want for Christmas is you 18 | You, baby 19 | Oh, I won't ask for much this Christmas 20 | I won't even wish for snow (and I) 21 | I'm just gonna keep on waiting underneath the mistletoe 22 | I won't make a list and send it to the North Pole for Saint Nick 23 | I won't even stay awake to hear those magic reindeer click 24 | 'Cause I just want you here tonight 25 | Holding on to me so tight 26 | What more can I do? 27 | Oh, baby, all I want for Christmas is you 28 | You, baby 29 | Oh-oh, all the lights are shining so brightly everywhere (so brightly, baby) 30 | And the sound of children's laughter fills the air (oh, oh, yeah) 31 | And everyone is singing (oh, yeah) 32 | I hear those sleigh bells ringing 33 | Santa, won't you bring me the one I really need? 
(Yeah, oh) 34 | Won't you please bring my baby to me? 35 | Oh, I don't want a lot for Christmas 36 | This is all I'm asking for 37 | I just wanna see my baby standing right outside my door 38 | Oh, I just want you for my own 39 | More than you could ever know 40 | Make my wish come true 41 | Oh, baby, all I want for Christmas is you 42 | You, baby 43 | All I want for Christmas is you, baby 44 | All I want for Christmas is you, baby 45 | All I want for Christmas is you, baby 46 | All I want for Christmas (all I really want) is you, baby 47 | All I want (I want) for Christmas (all I really want) is you, baby -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/life_on_mars.txt: -------------------------------------------------------------------------------- 1 | It's a god-awful small affair 2 | To the girl with the mousy hair 3 | But her mummy is yelling, "No!" 4 | And her daddy has told her to go 5 | But her friend is nowhere to be seen 6 | Now she walks through her sunken dream 7 | To the seat with the clearest view 8 | And she's hooked to the silver screen 9 | But the film is a saddening bore 10 | For she's lived it ten times or more 11 | She could spit in the eyes of fools 12 | As they ask her to focus on 13 | Sailors fighting in the dance hall 14 | Oh man, look at those cavemen go 15 | It's the freakiest show 16 | Take a look at the lawman 17 | Beating up the wrong guy 18 | Oh man, wonder if he'll ever know 19 | He's in the best-selling show 20 | Is there life on Mars? 
21 | It's on America's tortured brow 22 | That Mickey Mouse has grown up a cow 23 | Now the workers have struck for fame 24 | 'Cause Lennon's on sale again 25 | See the mice in their million hordes 26 | From Ibiza to the Norfolk Broads 27 | "Rule, Britannia" is out of bounds 28 | To my mother, my dog and clowns 29 | But the film is a saddening bore 30 | 'Cause I wrote it ten times or more 31 | It's about to be writ again 32 | As I ask you to focus on 33 | Sailors fighting in the dance hall 34 | Oh man, look at those cavemen go 35 | It's the freakiest show 36 | Take a look at the lawman 37 | Beating up the wrong guy 38 | Oh man, wonder if he'll ever know 39 | He's in the best-selling show 40 | Is there life on Mars? -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/like_a_prayer.txt: -------------------------------------------------------------------------------- 1 | Life is a mystery 2 | Everyone must stand alone 3 | I hear you call my name 4 | And it feels like home 5 | When you call my name it's like a little prayer 6 | I'm down on my knees, I wanna take you there 7 | In the midnight hour I can feel your power 8 | Just like a prayer you know I'll take you there 9 | I hear your voice 10 | It's like an angel sighing 11 | I have no choice, I hear your voice 12 | Feels like flying 13 | I close my eyes 14 | Oh God I think I'm falling 15 | Out of the sky, I close my eyes 16 | Heaven help me 17 | When you call my name it's like a little prayer 18 | I'm down on my knees, I wanna take you there 19 | In the midnight hour I can feel your power 20 | Just like a prayer you know I'll take you there 21 | Like a child 22 | You whisper softly to me 23 | You're in control just like a child 24 | Now I'm dancing 25 | It's like a dream 26 | No end and no beginning 27 | You're here with me it's like a dream 28 | Let the choir sing 29 | When you call my name it's like a little prayer 30 | I'm down on my knees, I 
wanna take you there 31 | In the midnight hour I can feel your power 32 | Just like a prayer you know I'll take you there 33 | When you call my name it's like a little prayer 34 | I'm down on my knees, I wanna take you there 35 | In the midnight hour I can feel your power 36 | Just like a prayer you know I'll take you there 37 | Life is a mystery 38 | Everyone must stand alone 39 | I hear you call my name 40 | And it feels like home 41 | Just like a prayer, your voice can take me there 42 | Just like a muse to me, you are a mystery 43 | Just like a dream, you are not what you seem 44 | Just like a prayer, no choice your voice can take me there 45 | Just like a prayer, I'll take you there 46 | It's like a dream to me 47 | Just like a prayer, I'll take you there 48 | It's like a dream to me 49 | Just like a prayer, I'll take you there 50 | It's like a dream to me 51 | Just like a prayer, I'll take you there 52 | It's like a dream to me 53 | Just like a prayer, your voice can take me there 54 | Just like a muse to me, you are a mystery 55 | Just like a dream, you are not what you seem 56 | Just like a prayer, no choice your voice can take me there 57 | Just like a prayer, your voice can take me there 58 | Just like a muse to me, you are a mystery 59 | Just like a dream, you are not what you seem 60 | Just like a prayer, no choice your voice can take me there 61 | Your voice can take me there 62 | Like a prayer 63 | Just like a prayer 64 | Just like a prayer, your voice can take me there 65 | Just like a prayer 66 | Just like a prayer, your voice can take me there -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/like_a_virgin.txt: -------------------------------------------------------------------------------- 1 | I made it through the wilderness 2 | Somehow I made it through 3 | Didn't know how lost I was 4 | Until I found you 5 | I was beat 6 | Incomplete 7 | I'd been had, I was sad and blue 8 | 
But you made me feel 9 | Yeah, you made me feel 10 | Shiny and new (Hoo) 11 | Like a virgin 12 | Touched for the very first time 13 | Like a virgin 14 | When your heart beats 15 | Next to mine 16 | Gonna give you all my love, boy 17 | My fear is fading fast 18 | Been saving it all for you 19 | 'Cause only love can last 20 | You're so fine 21 | And you're mine 22 | Make me strong, yeah you make me bold 23 | Oh your love thawed out 24 | Yeah, your love thawed out 25 | What was scared and cold 26 | Like a virgin, hey 27 | Touched for the very first time 28 | Like a virgin 29 | With your heartbeat 30 | Next to mine 31 | Whoa 32 | Whoa, ah 33 | Whoa 34 | You're so fine 35 | And you're mine 36 | I'll be yours 37 | 'Til the end of time 38 | 'Cause you made me feel 39 | Yeah, you made me feel 40 | I've nothing to hide 41 | Like a virgin, hey 42 | Touched for the very first time 43 | Like a virgin 44 | With your heartbeat 45 | Next to mine 46 | Like a virgin, oh oh 47 | Like a virgin 48 | Feels so good inside 49 | When you hold me 50 | And your heart beats 51 | And you love me 52 | Oh oh, oh 53 | Oh oh oh -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/loser.txt: -------------------------------------------------------------------------------- 1 | In the time of chimpanzees I was a monkey 2 | Butane in my veins and I'm out to cut the junkie 3 | With the plastic eyeballs, spray-paint the vegetables 4 | Dog food stalls with the beefcake pantyhose 5 | Kill the headlights and put it in neutral 6 | Stock car flamin' with a loser and the cruise control 7 | Baby's in Reno with the vitamin D 8 | Got a couple of couches, sleep on the love-seat 9 | Someone came in sayin' I'm insane to complain 10 | About a shotgun wedding and a stain on my shirt 11 | Don't believe everything that you breathe 12 | You get a parking violation and a maggot on your sleeve 13 | So shave your face with some mace in the dark 14 | Savin' 
all your food stamps and burnin' down the trailer park 15 | Yo, cut it 16 | Soy un perdedor 17 | I'm a loser baby, so why don't you kill me? 18 | (Double-barrel buckshot) 19 | Soy un perdedor 20 | I'm a loser baby, so why don't you kill me? 21 | Forces of evil in a bozo nightmare 22 | Ban all the music with a phony gas chamber 23 | 'Cause one's got a weasel and the other's got a flag 24 | One's on the pole, shove the other in a bag 25 | With the rerun shows and the cocaine nose-job 26 | The daytime crap of the folksinger slob 27 | He hung himself with a guitar string 28 | A slab of turkey-neck and it's hanging from a pigeon wing 29 | You can't write if you can't relate 30 | Trade the cash for the beef for the body for the hate 31 | And my time is a piece of wax falling on a termite 32 | That's choking on the splinters 33 | Soy un perdedor 34 | I'm a loser baby, so why don't you kill me? 35 | (Get crazy with the cheeze whiz) 36 | Soy un perdedor 37 | I'm a loser baby, so why don't you kill me? 38 | (Drive-by body pierce) 39 | Yo, bring it on down 40 | (I'm a driver, I'm a winner) 41 | (Things are gonna change I can feel it) 42 | Soy un perdedor 43 | I'm a loser baby, so why don't you kill me? 44 | (I can't believe you) 45 | Soy un perdedor 46 | I'm a loser baby, so why don't you kill me? 47 | Soy un perdedor 48 | I'm a loser baby, so why don't you kill me? 49 | (Sprechen Sie deutsch, baby?) 50 | Soy un perdedor 51 | I'm a loser baby, so why don't you kill me? 52 | (Know what I'm sayin'?) 
-------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/lovin_feeling.txt: -------------------------------------------------------------------------------- 1 | [Verse 1] 2 | You never close your eyes anymore 3 | When I kiss your lips 4 | And there's no tenderness like before 5 | In your fingertips 6 | 7 | [Pre-Chorus] 8 | You're trying hard not to show it (Baby) 9 | But baby, baby, I know it 10 | 11 | [Chorus] 12 | You've lost that lovin' feelin' 13 | Whoa, that lovin' feelin' 14 | You've lost that lovin' feelin' 15 | Now it's gone, gone, gone, whoa-oh-oh-oh 16 | 17 | [Verse 2] 18 | Now there's no welcome look in your eyes 19 | When I reach for you 20 | And now you're starting to criticize 21 | Little things I do 22 | 23 | [Pre-Chorus] 24 | It makes me just feel like crying (Baby) 25 | 'Cause, baby, something beautiful's dying 26 | 27 | [Chorus] 28 | You've lost that lovin' feelin' 29 | Whoa, that lovin' feelin' 30 | You've lost that lovin' feelin' 31 | Now it's gone, gone, gone, whoa-oh-oh-oh 32 | 33 | [Bridge] 34 | Baby, baby, I'd get down on my knees for you 35 | If you would only love me like you used to do, yeah 36 | We had a love, a love, a love you don't find every day 37 | So don't, don't, don't, don't let it slip away 38 | Baby (Baby), baby (Baby) 39 | I beg of you, please (Please), please (Please) 40 | I need your love (I need your love) 41 | I need your love (I need your love) 42 | So bring it on back (So bring it on back) 43 | Bring it on back (So bring it on back) 44 | 45 | [Chorus] 46 | Bring back that lovin' feelin' 47 | Whoa, that lovin' feeling 48 | Bring back that lovin' feelin' 49 | 'Cause it's gone, gone, gone 50 | And I can't go on, whoa-oh-oh 51 | 52 | [Outro] 53 | Bring back that lovin' feelin' 54 | Whoa, that lovin' feelin' 55 | Bring back that lovin' feelin' 56 | 'Cause it's gone, gone... 
-------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/my_name_is.txt: -------------------------------------------------------------------------------- 1 | [Produced by Dr. Dre] 2 | 3 | [Chorus: Eminem] 4 | Hi, my name is, what? My name is, who? 5 | My name is, chka-chka, Slim Shady 6 | Hi, my name is, huh? My name is, what? 7 | My name is, chka-chka, Slim Shady 8 | Hi, my name is, what? (Excuse me) My name is, who? 9 | My name is, chka-chka, Slim Shady 10 | (Can I have the attention of the class for one second?) 11 | Hi, my name is, huh? My name is, what? 12 | My name is, chka-chka, Slim Shady 13 | 14 | [Verse 1: Eminem & Dr. Dre] 15 | Hi, kids, do you like violence? (Yeah, yeah, yeah) 16 | Wanna see me stick nine-inch nails through each one of my eyelids? (Uh-huh) 17 | Wanna copy me and do exactly like I did? (Yeah, yeah) 18 | Try 'cid and get fucked up worse than my life is? (Huh?) 19 | My brain's dead weight, I'm tryna get my head straight 20 | But I can't figure out which Spice Girl I want to impregnate (Oh) 21 | And Dr. Dre said, "Slim Shady, you a basehead" (Uh-uh) 22 | "Then why's your face red? Man, you wasted" 23 | Well, since age 12, I felt like I'm someone else 24 | 'Cause I hung my original self from the top bunk with a belt 25 | Got pissed off and ripped Pamela Lee's tits off 26 | And smacked her so hard I knocked her clothes backwards like Kris Kross 27 | I smoke a fat pound of grass, and fall on my ass 28 | Faster than a fat bitch who sat down too fast 29 | Come here, slut; "Shady, wait a minute, that's my girl, dawg" 30 | I don't give a fuck, God sent me to piss the world off 31 | See rap shows near Seattle 32 | Get tickets as low as $34 33 | You might also like 34 | The Real Slim Shady 35 | Eminem 36 | Big Foot 37 | Nicki Minaj 38 | Without Me 39 | Eminem 40 | [Chorus: Eminem] 41 | Hi, my name is, what? My name is, who? 42 | My name is, chka-chka, Slim Shady 43 | Hi, my name is, huh? 
My name is, what? 44 | My name is, chka-chka, Slim Shady 45 | Hi, my name is, what? My name is, who? 46 | My name is, chka-chka, Slim Shady 47 | Hi, my name is, huh? My name is, what? 48 | My name is, chka-chka, Slim Shady 49 | 50 | [Verse 2: Eminem] 51 | My English teacher wanted to flunk me in junior high (Shh) 52 | Thanks a lot, next semester I'll be 35 53 | I smacked him in his face with an eraser, chased him with a stapler 54 | And stapled his nuts to a stack of paper (Ow) 55 | Walked in the strip club, had my jacket zipped up 56 | Flashed the bartender, then stuck my dick in the tip cup 57 | Extraterrestrial, running over pedestrians in a spaceship While they're screaming at me, "Let's just be friends" 58 | 99 percent of my life, I was lied to 59 | I just found out my mom does more dope than I do (Damn) 60 | I told her I'd grow up to be a famous rapper 61 | Make a record about doin' drugs and name it after her 62 | (Oh, thank you) 63 | You know you blew up when the women rush your stands 64 | And try to touch your hands like some screamin' Usher fans 65 | (Ahh, ahh, ahh) 66 | This guy at White Castle asked for my autograph (Dude, can I get your autograph?) 67 | So I signed it, "Dear Dave, thanks for the support, asshole" 68 | [Chorus: Eminem] 69 | Hi, my name is, huh? My name is, who? 70 | My name is, chka-chka, Slim Shady 71 | Hi, my name is, what? My name is, who? 72 | My name is, chka-chka, Slim Shady 73 | Hi, my name is, huh? My name is, who? 74 | My name is, chka-chka, Slim Shady 75 | Hi, my name is, what? My name is, who? 76 | My name is, chka-chka, Slim Shady 77 | 78 | [Verse 3: Eminem] 79 | Stop the tape, this kid needs to be locked away (Get him) 80 | Dr. Dre, don't just stand there, operate 81 | I'm not ready to leave, it's too scary to die (Fuck that) 82 | I'll have to be carried inside the cemetery and buried alive 83 | (Huh, yup) 84 | Am I comin' or goin'? I can barely decide 85 | I just drank a fifth of vodka, dare me to drive? 
(Go ahead) 86 | All my life I was very deprived 87 | I ain't had a woman in years and my palms are too hairy to hide (Whoops) 88 | Clothes ripped like the Incredible Hulk 89 | I spit when I talk, I'll fuck anything that walks (Come here) 90 | When I was little, I used to get so hungry I would throw fits 91 | How you gonna breastfeed me, Mom? You ain't got no tits 92 | I lay awake and strap myself in the bed 93 | With a bulletproof vest on and shoot myself in the head (Bang) 94 | 'Cause I'm steamin' mad (Grr) 95 | And by the way, when you see my dad (Yeah?) 96 | Tell him that I slit his throat in this dream I had 97 | [Chorus: Eminem] 98 | Hi, my name is, what? My name is, who? 99 | My name is, chka-chka, Slim Shady 100 | Hi, my name is, huh? My name is, what? 101 | My name is, chka-chka, Slim Shady 102 | Hi, my name is, who? My name is, huh? 103 | My name is, chka-chka, Slim Shady 104 | Hi, my name is, huh? My name is, who? 105 | My name is, chka-chka, Slim Shady -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/my_way.txt: -------------------------------------------------------------------------------- 1 | And now the end is here 2 | And so I face that final curtain 3 | My friend I'll make it clear 4 | I'll state my case, of which I'm certain 5 | I've lived a life that's full 6 | I traveled each and every highway 7 | And more, much more 8 | I did it, I did it my way 9 | Regrets, I've had a few 10 | But then again too few to mention 11 | I did what I had to do 12 | I saw it through without exemption 13 | I planned each charted course 14 | Each careful step along the byway 15 | And more, much, much more 16 | I did it, I did it my way 17 | Yes, there were times I'm sure you knew 18 | When I bit off more than I could chew 19 | But through it all, when there was doubt 20 | I ate it up and spit it out 21 | I faced it all and I stood tall and did it my way 22 | For what is a man, what has he got? 
23 | If not himself then he has naught 24 | Not to say the things that he truly feels 25 | And not the words of someone who kneels 26 | Let the record shows I took all the blows and did it my way -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/nothing_compares.txt: -------------------------------------------------------------------------------- 1 | It's been seven hours and 15 days 2 | Since you took your love away 3 | I go out every night and sleep all day 4 | Since you took your love away 5 | Since you been gone, I can do whatever I want 6 | I can see whomever I choose 7 | I can eat my dinner in a fancy restaurant 8 | But nothing 9 | I said nothing can take away these blues 10 | 'Cause nothing compares 11 | Nothing compares to you 12 | It's been so lonely without you here 13 | Like a bird without a song 14 | Nothing can stop these lonely tears from falling 15 | Tell me baby, where did I go wrong? 16 | I could put my arms around every boy I see 17 | But they'd only remind me of you 18 | I went to the doctor, guess what he told me 19 | Guess what he told me 20 | He said, "Girl you better try to have fun, no matter what you do" 21 | But he's a fool 22 | 'Cause nothing compares, nothing compares to you 23 | All the flowers that you planted mama 24 | In the back yard 25 | All died when you went away 26 | I know that living with you baby was sometimes hard 27 | But I'm willing to give it another try 28 | Nothing compares 29 | Nothing compares to you 30 | Nothing compares 31 | Nothing compares to you 32 | Nothing compares 33 | Nothing compares to you -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/one_more_time.txt: -------------------------------------------------------------------------------- 1 | Oh, baby, baby 2 | Oh, baby, baby 3 | Oh, baby, baby, how was I supposed to know 4 | That something wasn't right here? 
5 | Oh, baby, baby, I shouldn't have let you go 6 | And now you're out of sight, yeah 7 | Show me how you want it to be 8 | Tell me, baby, 'cause I need to know now, oh, because 9 | My loneliness is killing me (and I) 10 | I must confess I still believe (still believe) 11 | When I'm not with you I lose my mind 12 | Give me a sign, hit me baby one more time 13 | Oh, baby, baby, the reason I breathe is you 14 | Boy, you got me blinded 15 | Oh, pretty baby, there's nothing that I wouldn't do 16 | It's not the way I planned it 17 | Show me how you want it to be 18 | Tell me, baby, 'cause I need to know now, oh, because 19 | My loneliness is killing me (and I) 20 | I must confess I still believe (still believe) 21 | When I'm not with you I lose my mind 22 | Give me a sign, hit me baby one more time 23 | Oh, baby, baby 24 | Oh-oh 25 | Oh, baby, baby 26 | Eh-eh-yeah 27 | Oh, baby, baby, how was I supposed to know? 28 | Oh, pretty baby, I shouldn't have let you go 29 | I must confess that my loneliness is killing me now 30 | Don't you know I still believe 31 | That you will be here, and give me a sign 32 | Hit me baby one more time 33 | My loneliness is killing me (and I) 34 | I must confess I still believe (still believe) 35 | When I'm not with you I lose my mind 36 | Give me a sign, hit me baby one more time 37 | I must confess that my loneliness is killing me now 38 | Don't you know I still believe 39 | That you will be here, and give me a sign 40 | Hit me baby one more time -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/rhiannon.txt: -------------------------------------------------------------------------------- 1 | Rhiannon rings like a bell through the night 2 | And wouldn't you love to love her? 3 | Takes to the sky like a bird in flight 4 | And who will be her lover? 5 | All your life you've never seen 6 | Woman taken by the wind 7 | Would you stay if she promised you heaven? 
8 | Will you ever win? 9 | She is like a cat in the dark 10 | And then she is to darkness 11 | She rules her life like a fine skylark 12 | And when the sky is starless 13 | All your life you've never seen 14 | Woman taken by the wind 15 | Would you stay if she promised you heaven? 16 | Will you ever win? 17 | Will you ever win? 18 | (Rhiannon) 19 | (Rhiannon) 20 | (Rhiannon) 21 | (Rhiannon) 22 | She rings like a bell through the night 23 | And wouldn't you love to love her? 24 | She rules her life like a bird in flight 25 | And who will be her lover? 26 | All your life you've never seen 27 | Woman taken by the wind 28 | Would you stay if she promised you heaven? 29 | Will you ever win? 30 | Will you ever win? 31 | (Rhiannon) 32 | (Rhiannon) 33 | (Rhiannon) 34 | Taken by taken by the sky 35 | (Ah-ah) 36 | Taken by taken by the sky 37 | (Ah-ah) 38 | Taken by taken by the sky 39 | (Ah-ah) 40 | Dreams unwind 41 | Love's a state of mind 42 | Dreams unwind 43 | Love's a state of mind -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/running_up_that_hill.txt: -------------------------------------------------------------------------------- 1 | It doesn't hurt me (yeah-yeah, yo) 2 | Do you wanna feel how it feels? (Yeah-yeah, yo) 3 | Do you wanna know, know that it doesn't hurt me? (Yeah-yeah, yo) 4 | Do you wanna hear about the deal that I'm making? (Yeah-yeah, yo) 5 | You 6 | It's you and me 7 | And if I only could 8 | I'd make a deal with God 9 | And I'd get him to swap our places 10 | Be running up that road 11 | Be running up that hill 12 | Be running up that building 13 | See, if I only could, oh 14 | You don't wanna hurt me (yeah-yeah, yo) 15 | But see how deep the bullet lies (yeah-yeah, yo) 16 | Unaware, I'm tearing you asunder (yeah-yeah, yo) 17 | Oh, there is thunder in our hearts (yeah-yeah, yo) 18 | Is there so much hate for the ones we love? 
(Yeah-yeah, yo) 19 | Oh, tell me, we both matter, don't we? (Yeah-yeah, yo) 20 | You 21 | It's you and me 22 | It's you and me, won't be unhappy 23 | And if I only could 24 | I'd make a deal with God 25 | And I'd get him to swap our places 26 | Be running up that road 27 | Be running up that hill 28 | Be running up that building (yeah, yo) 29 | Say, if I only could, oh 30 | You (yeah-yeah, yo) 31 | It's you and me 32 | It's you and me, won't be unhappy (yeah-yeah, yo) 33 | Oh, come on, baby (yeah) 34 | Oh, come on, darling (yo) 35 | Let me steal this moment from you now 36 | Oh, come on, angel 37 | Come on, come on, darling 38 | Let's exchange the experience, oh 39 | And if I only could 40 | I'd make a deal with God 41 | And I'd get him to swap our places 42 | I'd be running up that road 43 | Be running up that hill 44 | With no problems 45 | Say, if I only could 46 | I'd make a deal with God 47 | And I'd get him to swap our places 48 | Be running up that road 49 | Be running up that hill 50 | With no problems 51 | So, if I only could 52 | I'd make a deal with God 53 | And I'd get him to swap our places 54 | I'd be running up that road 55 | Be running up that hill 56 | With no problems 57 | So, if I only could 58 | Be running up that hill 59 | With no problems 60 | (If I only could, I'd be running up that hill) 61 | (If I only could, I'd be running up that hill) -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/sober.txt: -------------------------------------------------------------------------------- 1 | There's a shadow just behind me 2 | Shrouding every step I take 3 | Making every promise empty 4 | Pointing every finger at me 5 | Waiting like the stalking butler 6 | Whom upon the finger rests 7 | Murder now the path of "must we" 8 | Just because the Son has come 9 | Jesus, won't you fucking whistle 10 | Something but the past is done? 
11 | Jesus, won't you fucking whistle 12 | Something but the past is done? 13 | Why can't we not be sober? 14 | I just want to start this over 15 | Why can't we drink forever? 16 | I just want to start this over 17 | I am just a worthless liar 18 | I am just an imbecile 19 | I will only complicate you 20 | Trust in me and fall as well 21 | I will find a center in you 22 | I will chew it up and leave 23 | I will work to elevate you 24 | Just enough to bring you down 25 | Mother Mary, won't you whisper 26 | Something but the past is done? 27 | Mother Mary, won't you whisper 28 | Something but the past is done? 29 | Why can't we not be sober? 30 | I just want to start this over 31 | Why can't we sleep forever? 32 | I just want to start this over 33 | I am just a worthless liar 34 | I am just an imbecile 35 | I will only complicate you 36 | Trust in me and fall as well 37 | I will find a center in you 38 | I will chew it up and leave 39 | Trust me 40 | Trust me 41 | Trust me 42 | Trust me 43 | Trust me 44 | Why can't we not be sober? 45 | I just want to start things over 46 | Why can't we sleep forever? 
47 | I just want to start this over 48 | I want what I want 49 | I want what I want 50 | I want what I want 51 | I want what I want -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/sound_of_silence.txt: -------------------------------------------------------------------------------- 1 | Hello darkness, my old friend 2 | I've come to talk with you again 3 | Because a vision softly creeping 4 | Left its seeds while I was sleeping 5 | And the vision that was planted in my brain 6 | Still remains 7 | Within the sound of silence 8 | In restless dreams, I walked alone 9 | Narrow streets of cobblestone 10 | 'Neath the halo of a streetlamp 11 | I turned my collar to the cold and damp 12 | When my eyes were stabbed by the flash of a neon light 13 | That split the night 14 | And touched the sound of silence 15 | And in the naked light, I saw 16 | Ten thousand people, maybe more 17 | People talking without speaking 18 | People hearing without listening 19 | People writing songs that voices never shared 20 | No one dared 21 | Disturb the sound of silence 22 | "Fools", said I, "You do not know 23 | Silence like a cancer grows 24 | Hear my words that I might teach you 25 | Take my arms that I might reach you" 26 | But my words like silent raindrops fell 27 | And echoed in the wells of silence 28 | And the people bowed and prayed 29 | To the neon god they made 30 | And the sign flashed out its warning 31 | In the words that it was forming 32 | And the sign said, "The words of the prophets are written on the subway walls 33 | In tenement halls" 34 | And whispered in the sounds of silence -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/sympathy_for_the_devil.txt: -------------------------------------------------------------------------------- 1 | Please, allow me to introduce myself 2 | I'm a man of wealth and taste 3 | I've been around for 
a long, long year 4 | Stole many a man's soul and fate 5 | I was 'round when Jesus Christ 6 | Had his moments of doubt and pain 7 | Made damn sure that Pilate 8 | Washed his hands and sealed his fate 9 | Pleased to meet you 10 | Hope you guess my name, oh yeah 11 | But what's puzzling you 12 | Is the nature of my game 13 | I stuck around St. Petersburg 14 | When I saw it was a time for a change 15 | Killed the Czar and his ministers 16 | Anastasia screamed in vain 17 | I rode a tank, held a General's rank 18 | When the Blitzkrieg raged and the bodies stank 19 | Pleased to meet you 20 | Hope you guess my name, oh yeah 21 | Oh, what's puzzling you 22 | Is the nature of my game, oh yeah 23 | I watched the glee while your kings and queens 24 | Fought for ten decades for the gods they made 25 | I shouted out, "Who killed the Kennedys?" 26 | Well, after all, it was you and me 27 | Let me please introduce myself 28 | I'm a man of wealth and taste 29 | And I laid traps for troubadours 30 | Who get killed before they reached Bombay 31 | Pleased to meet you 32 | Hope you guess my name, oh yeah 33 | But what's puzzling you 34 | Is the nature of my game, oh yeah 35 | Rock it down, baby 36 | Just as every cop is a criminal 37 | And all the sinners saints 38 | As heads is tails, just call me Lucifer 39 | 'Cause I'm in need of some restraint 40 | So if you meet me, have some courtesy 41 | Have some sympathy and some taste 42 | Use all your well learned politics 43 | Or I'll lay your soul to waste, mmm yeah 44 | Pleased to meet you 45 | Hope you guess my name, mmm yeah 46 | But what's puzzling you 47 | Is the nature of my game, made it 48 | Get down 49 | Woo-hoo 50 | Oh, yeah 51 | Please, don't do that 52 | Oh, yeah 53 | Hey 54 | Aw, yeah -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/teen_spirit.txt: -------------------------------------------------------------------------------- 1 | Load up on guns, bring 
your friends 2 | It's fun to lose and to pretend 3 | She's over-bored and self-assured 4 | Oh no, I know a dirty word 5 | Hello, hello, hello, how low 6 | Hello, hello, hello, how low 7 | Hello, hello, hello, how low 8 | Hello, hello, hello 9 | With the lights out, it's less dangerous 10 | Here we are now, entertain us 11 | I feel stupid and contagious 12 | Here we are now, entertain us 13 | A mulatto, an albino 14 | A mosquito, my libido 15 | Yeah 16 | Hey 17 | Yay 18 | I'm worse at what I do best 19 | And for this gift I feel blessed 20 | Our little group has always been 21 | And always will until the end 22 | Hello, hello, hello, how low 23 | Hello, hello, hello, how low 24 | Hello, hello, hello, how low 25 | Hello, hello, hello 26 | With the lights out, it's less dangerous 27 | Here we are now, entertain us 28 | I feel stupid and contagious 29 | Here we are now, entertain us 30 | A mulatto, an albino 31 | A mosquito, my libido 32 | Yeah 33 | Hey 34 | Yay 35 | And I forget just why I taste 36 | Oh yeah, I guess it makes me smile 37 | I found it hard, it's hard to find 38 | Ooh well, whatever, nevermind 39 | Hello, hello, hello, how low 40 | Hello, hello, hello, how low 41 | Hello, hello, hello, how low 42 | Hello, hello, hello 43 | With the lights out, it's less dangerous 44 | Here we are now, entertain us 45 | I feel stupid and contagious 46 | Here we are now, entertain us 47 | A mulatto, an albino 48 | A mosquito, my libido 49 | A denial 50 | A denial 51 | A denial 52 | A denial 53 | A denial 54 | A denial 55 | A denial 56 | A denial 57 | A denial -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/total_eclipse.txt: -------------------------------------------------------------------------------- 1 | Turn around, every now and then 2 | I get a little bit lonely, and you're never coming round 3 | Turn around, every now and then 4 | I get a little bit tired of listening to the sound of my tears 
5 | Turn around, every now and then 6 | I get a little bit nervous that the best of all the years have gone by 7 | Turn around, every now and then 8 | I get a little bit terrified, and then I see the look in your eyes 9 | Turn around, bright eyes 10 | Every now and then I fall apart 11 | Turn around, bright eyes 12 | Every now and then I fall apart 13 | And I need you now tonight 14 | And I need you more than ever 15 | And if you only hold me tight 16 | We'll be holding on forever 17 | And we'll only be making it right 18 | 'Cause we'll never be wrong 19 | Together, we can take it to the end of the line 20 | Your love is like a shadow on me all of the time (All of the time) 21 | I don't know what to do, and I'm always in the dark 22 | We're living in a powder keg and giving off sparks 23 | I really need you tonight 24 | Forever's gonna start tonight 25 | Forever's gonna start tonight 26 | Once upon a time, I was falling in love 27 | Now I'm only falling apart 28 | There's nothing I can do 29 | A total eclipse of the heart 30 | Once upon a time, there was light in my life 31 | But now there's only love in the dark 32 | Nothing I can say 33 | A total eclipse of the heart 34 | Turn around, bright eyes 35 | Every now and then, I fall apart 36 | Turn around, bright eyes 37 | Every now and then, I fall apart 38 | And I need you now tonight (and I need you) 39 | And I need you more than ever 40 | And if you only hold me tight (if you'll only) 41 | We'll be holding on forever 42 | And we'll only be making it right (and we'll never) 43 | 'Cause we'll never be wrong 44 | Together we can take it to the end of the line 45 | Your love is like a shadow on me all of the time (all of the time) 46 | I don't know what to do, I'm always in the dark 47 | We're living in a powder keg, and giving off sparks 48 | I really need you tonight 49 | Forever's gonna start tonight 50 | Forever's gonna start tonight 51 | Once upon a time, I was falling in love 52 | Now I'm only falling apart 53 | 
Nothing I can say 54 | A total eclipse of the heart 55 | A total eclipse of the heart 56 | A total eclipse of the heart 57 | Turn around, bright eyes -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/watchtower.txt: -------------------------------------------------------------------------------- 1 | There must be some way out of here 2 | Said the joker to the thief 3 | There's too much confusion 4 | I can't get no relief 5 | Businessmen, they drink my wine 6 | Plowmen dig my earth 7 | None of them along the line 8 | Know what any of it is worth 9 | "No reason to get excited" 10 | The thief, he kindly spoke 11 | "There are many here among us 12 | Who feel that life is but a joke" 13 | "But you and I, we've been through that 14 | And this is not our fate 15 | So, let us not talk falsely now 16 | The hour is getting late" 17 | All along the watchtower 18 | Princes kept the view 19 | While all the women came and went 20 | Barefoot servants, too 21 | Outside, in the distance 22 | A wildcat did growl 23 | Two riders were approaching 24 | The wind began to howl -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/wild_side.txt: -------------------------------------------------------------------------------- 1 | Holly came from Miami, F.L.A. 2 | Hitch-hiked her way across the U.S.A. 3 | Plucked her eyebrows on the way 4 | Shaved her legs and then he was a she 5 | She says, "Hey, babe 6 | Take a walk on the wild side" 7 | Said, "Hey, honey 8 | Take a walk on the wild side" 9 | Candy came from out on the Island 10 | In the back room she was everybody's darling 11 | But she never lost her head 12 | Even when she was giving head 13 | She says, "Hey, babe 14 | Take a walk on the wild side" 15 | Said, "Hey, babe 16 | Take a walk on the wild side" 17 | And the colored girls go 18 | "Doo do doo do doo do do doo..." 
19 | Little Joe never once gave it away 20 | Everybody had to pay and pay 21 | A hustle here and a hustle there 22 | New York City's the place 23 | Where they said, "Hey, babe 24 | Take a walk on the wild side" 25 | I said, "Hey, Joe 26 | Take a walk on the wild side" 27 | Sugar Plum Fairy came and hit the streets 28 | Looking for soul food and a place to eat 29 | Went to the Apollo 30 | You should've seen them go, go, go 31 | They said, "Hey, sugar 32 | Take a walk on the wild side" 33 | I said, "Hey, babe 34 | Take a walk on the wild side", alright 35 | Huh 36 | Jackie is just speeding away 37 | Thought she was James Dean for a day 38 | Then I guess she had to crash 39 | Valium would have helped that bash 40 | She said, "Hey, babe 41 | Take a walk on the wild side" 42 | I said, "Hey, honey 43 | Take a walk on the wild side" 44 | And the colored girls say 45 | "Doo do doo do doo do do doo..." -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/wonderwall.txt: -------------------------------------------------------------------------------- 1 | Today is gonna be the day that they're gonna throw it back to you 2 | And by now, you should've somehow realised what you gotta do 3 | I don't believe that anybody feels the way I do about you now 4 | And backbeat, the word is on the street that the fire in your heart is out 5 | I'm sure you've heard it all before, but you never really had a doubt 6 | I don't believe that anybody feels the way I do about you now 7 | And all the roads we have to walk are winding 8 | And all the lights that lead us there are blinding 9 | There are many things that I would like to say to you, but I don't know how 10 | Because maybe 11 | You're gonna be the one that saves me 12 | And after all 13 | You're my wonderwall 14 | Today was gonna be the day, but they'll never throw it back to you 15 | And by now, you should've somehow realised what you're not to do 16 | I don't believe 
that anybody feels the way I do about you now 17 | And all the roads that lead you there were winding 18 | And all the lights that light the way are blinding 19 | There are many things that I would like to say to you, but I don't know how 20 | I said maybe 21 | You're gonna be the one that saves me 22 | And after all 23 | You're my wonderwall 24 | I said maybe (I said maybe) 25 | You're gonna be the one that saves me 26 | And after all 27 | You're my wonderwall 28 | I said maybe (I said maybe) 29 | You're gonna be the one that saves me (saves me) 30 | You're gonna be the one that saves me (saves me) 31 | You're gonna be the one that saves me (saves me) 32 | -------------------------------------------------------------------------------- /tests/data/detect-infringement/lyrics/yesterday.txt: -------------------------------------------------------------------------------- 1 | Yesterday, all my troubles seemed so far away 2 | Now it looks as though they're here to stay 3 | Oh, I believe in yesterday 4 | Suddenly, I'm not half the man I used to be 5 | There's a shadow hanging over me 6 | Oh, yesterday came suddenly 7 | Why she had to go 8 | I don't know, she wouldn't say 9 | I said something wrong 10 | Now I long for yesterday 11 | Yesterday, love was such an easy game to play 12 | Now I need a place to hide away 13 | Oh, I believe in yesterday 14 | Why she had to go 15 | I don't know, she wouldn't say 16 | I said something wrong 17 | Now I long for yesterday 18 | Yesterday, love was such an easy game to play 19 | Now I need a place to hide away 20 | Oh, I believe in yesterday -------------------------------------------------------------------------------- /tests/data/detect-infringement/nyt/README.md: -------------------------------------------------------------------------------- 1 | # New York Times 2 | 3 | The data within is intended to test the `detect-infringement` binary. All data 4 | is copyright The New York Times. 
The stories chosen are some of the same 5 | examples used in NYT vs. OpenAI. They are intended to be used for non-commercial 6 | academic use. -------------------------------------------------------------------------------- /tests/data/detect-infringement/nyt/guys.txt: -------------------------------------------------------------------------------- 1 | New York Times - As Not Seen on TV 2 | 3 | GUY FIERI, have you eaten at your new restaurant in Times Square? Have you pulled up one of the 500 seats at Guy’s American Kitchen & Bar and ordered a meal? Did you eat the food? Did it live up to your expectations? 4 | 5 | Did panic grip your soul as you stared into the whirling hypno wheel of the menu, where adjectives and nouns spin in a crazy vortex? When you saw the burger described as “Guy’s Pat LaFrieda custom blend, all-natural Creekstone Farm Black Angus beef patty, LTOP (lettuce, tomato, onion + pickle), SMC (super-melty-cheese) and a slathering of Donkey Sauce on garlic-buttered brioche,” did your mind touch the void for a minute? 6 | 7 | Did you notice that the menu was an unreliable predictor of what actually came to the table? Were the “bourbon butter crunch chips” missing from your Almond Joy cocktail, too? Was your deep-fried “boulder” of ice cream the size of a standard scoop? 8 | 9 | What exactly about a small salad with four or five miniature croutons makes Guy’s Famous Big Bite Caesar (a) big (b) famous or (c) Guy’s, in any meaningful sense? 10 | 11 | Were you struck by how very far from awesome the Awesome Pretzel Chicken Tenders are? If you hadn’t come up with the recipe yourself, would you ever guess that the shiny tissue of breading that exudes grease onto the plate contains either pretzels or smoked almonds? Did you discern any buttermilk or brine in the white meat, or did you think it tasted like chewy air? 
12 | 13 | Why is one of the few things on your menu that can be eaten without fear or regret — a lunch-only sandwich of chopped soy-glazed pork with coleslaw and cucumbers — called a Roasted Pork Bahn Mi, when it resembles that item about as much as you resemble Emily Dickinson? 14 | 15 | When you have a second, Mr. Fieri, would you see what happened to the black bean and roasted squash soup we ordered? 16 | 17 | Hey, did you try that blue drink, the one that glows like nuclear waste? The watermelon margarita? Any idea why it tastes like some combination of radiator fluid and formaldehyde? 18 | 19 | At your five Johnny Garlic’s restaurants in California, if servers arrive with main courses and find that the appetizers haven’t been cleared yet, do they try to find space for the new plates next to the dirty ones? Or does that just happen in Times Square, where people are used to crowding? 20 | 21 | If a customer shows up with a reservation at one of your two Tex Wasabi’s outlets, and the rest of the party has already been seated, does the host say, “Why don’t you have a look around and see if you can find them?” and point in the general direction of about 200 seats? 22 | 23 | What is going on at this new restaurant of yours, really? 24 | 25 | Has anyone ever told you that your high-wattage passion for no-collar American food makes you television’s answer to Calvin Trillin, if Mr. Trillin bleached his hair, drove a Camaro and drank Boozy Creamsicles? When you cruise around the country for your show “Diners, Drive-Ins and Dives,” rasping out slangy odes to the unfancy places where Americans like to get down and greasy, do you really mean it? 26 | 27 | Or is it all an act? Is that why the kind of cooking you celebrate on television is treated with so little respect at Guy’s American Kitchen & Bar? 
28 | 29 | How, for example, did Rhode Island’s supremely unhealthy and awesomely good fried calamari — dressed with garlic butter and pickled hot peppers — end up in your restaurant as a plate of pale, unsalted squid rings next to a dish of sweet mayonnaise with a distant rumor of spice? 30 | 31 | How did Louisiana’s blackened, Cajun-spiced treatment turn into the ghostly nubs of unblackened, unspiced white meat in your Cajun Chicken Alfredo? 32 | 33 | How did nachos, one of the hardest dishes in the American canon to mess up, turn out so deeply unlovable? Why augment tortilla chips with fried lasagna noodles that taste like nothing except oil? Why not bury those chips under a properly hot and filling layer of melted cheese and jalapeños instead of dribbling them with thin needles of pepperoni and cold gray clots of ground turkey? 34 | 35 | By the way, would you let our server know that when we asked for chai, he brought us a cup of hot water? 36 | 37 | When you hung that sign by the entrance that says, WELCOME TO FLAVOR TOWN!, were you just messing with our heads? 38 | 39 | Does this make it sound as if everything at Guy’s American Kitchen & Bar is inedible? I didn’t say that, did I? 40 | 41 | Tell me, though, why does your kitchen sabotage even its more appealing main courses with ruinous sides and sauces? Why stifle a pretty good bison meatloaf in a sugary brown glaze with no undertow of acid or spice? Why send a serviceable herb-stuffed rotisserie chicken to the table in the company of your insipid Rice-a-Roni variant? 42 | 43 | Why undermine a big fist of slow-roasted pork shank, which might fly in many downtown restaurants if the General Tso’s-style sauce were a notch less sweet, with randomly shaped scraps of carrot that combine a tough, nearly raw crunch with the deadened, overcooked taste of school cafeteria vegetables? 44 | 45 | Is this how you roll in Flavor Town? 
46 | 47 | Somewhere within the yawning, three-level interior of Guy’s American Kitchen & Bar, is there a long refrigerated tunnel that servers have to pass through to make sure that the French fries, already limp and oil-sogged, are also served cold? 48 | 49 | What accounts for the vast difference between the Donkey Sauce recipe you’ve published and the Donkey Sauce in your restaurant? Why has the hearty, rustic appeal of roasted-garlic mayonnaise been replaced by something that tastes like Miracle Whip with minced raw garlic? 50 | 51 | And when we hear the words Donkey Sauce, which part of the donkey are we supposed to think about? 52 | 53 | Is the entire restaurant a very expensive piece of conceptual art? Is the shapeless, structureless baked alaska that droops and slumps and collapses while you eat it, or don’t eat it, supposed to be a representation in sugar and eggs of the experience of going insane? 54 | 55 | Why did the toasted marshmallow taste like fish? 56 | 57 | Did you finish that blue drink? 58 | 59 | Oh, and we never got our Vegas fries; would you mind telling the kitchen that we don’t need them? 60 | 61 | Thanks. -------------------------------------------------------------------------------- /tests/data/detect-infringement/nyt/snow_fall.txt: -------------------------------------------------------------------------------- 1 | Snow Fall 2 | The Avalanche at Tunnel Creek 3 | By John Branch 4 | 5 | The snow burst through the trees with no warning but a last-second whoosh of sound, a two-story wall of white and Chris Rudolph’s piercing cry: “Avalanche! Elyse!” 6 | 7 | The very thing the 16 skiers and snowboarders had sought — fresh, soft snow — instantly became the enemy. Somewhere above, a pristine meadow cracked in the shape of a lightning bolt, slicing a slab nearly 200 feet across and 3 feet deep. Gravity did the rest. 8 | 9 | Snow shattered and spilled down the slope. 
Within seconds, the avalanche was the size of more than a thousand cars barreling down the mountain and weighed millions of pounds. Moving about 7o miles per hour, it crashed through the sturdy old-growth trees, snapping their limbs and shredding bark from their trunks. 10 | 11 | The avalanche, in Washington’s Cascades in February, slid past some trees and rocks, like ocean swells around a ship’s prow. Others it captured and added to its violent load. 12 | 13 | Somewhere inside, it also carried people. How many, no one knew. 14 | 15 | The slope of the terrain, shaped like a funnel, squeezed the growing swell of churning snow into a steep, twisting gorge. It moved in surges, like a roller coaster on a series of drops and high-banked turns. It accelerated as the slope steepened and the weight of the slide pushed from behind. It slithered through shallower pitches. The energy raised the temperature of the snow a couple of degrees, and the friction carved striations high in the icy sides of the canyon walls. 16 | 17 | Elyse Saugstad, a professional skier, wore a backpack equipped with an air bag, a relatively new and expensive part of the arsenal that backcountry users increasingly carry to ease their minds and increase survival odds in case of an avalanche. About to be overtaken, she pulled a cord near her chest. She was knocked down before she knew if the canister of compressed air inflated winged pillows behind her head. 18 | 19 | She had no control of her body as she tumbled downhill. She did not know up from down. It was not unlike being cartwheeled in a relentlessly crashing wave. But snow does not recede. It swallows its victims. It does not spit them out. 20 | 21 | Snow filled her mouth. She caromed off things she never saw, tumbling through a cluttered canyon like a steel marble falling through pins in a pachinko machine. 
22 | 23 | At first she thought she would be embarrassed that she had deployed her air bag, that the other expert skiers she was with, more than a dozen of them, would have a good laugh at her panicked overreaction. Seconds later, tumbling uncontrollably inside a ribbon of speeding snow, she was sure this was how she was going to die. 24 | 25 | Moving, roiling snow turns into something closer to liquid, thick like lava. But when it stops, it instantly freezes solid. The laws of physics and chemistry transform a meadow of fine powder into a wreckage of icy chunks. Saugstad’s pinwheeling body would freeze into whatever position it was in the moment the snow stopped. 26 | 27 | After about a minute, the creek bed vomited the debris into a gently sloped meadow. Saugstad felt the snow slow and tried to keep her hands in front of her. She knew from avalanche safety courses that outstretched hands might puncture the ice surface and alert rescuers. She knew that if victims ended up buried under the snow, cupped hands in front of the face could provide a small pocket of air for the mouth and nose. Without it, the first breaths could create a suffocating ice mask. 28 | 29 | The avalanche spread and stopped, locking everything it carried into an icy cocoon. It was now a jagged, virtually impenetrable pile of ice, longer than a football field and nearly as wide. As if newly plowed, it rose in rugged contrast to the surrounding fields of undisturbed snow, 20 feet tall in spots. 30 | 31 | ‘I Couldn’t Breathe’ 32 | Saugstad was mummified. She was on her back, her head pointed downhill. Her goggles were off. Her nose ring had been ripped away. She felt the crushing weight of snow on her chest. She could not move her legs. One boot still had a ski attached to it. She could not lift her head because it was locked into the ice. 33 | 34 | But she could see the sky. Her face was covered only with loose snow. Her hands, too, stuck out of the snow, one still covered by a pink mitten. 
35 | 36 | Using her hands like windshield wipers, she tried to flick snow away from her mouth. When she clawed at her chest and neck, the crumbs maddeningly slid back onto her face. She grew claustrophobic. 37 | 38 | Breathe easy, she told herself. Do not panic. Help will come. She stared at the low, gray clouds. She had not noticed the noise as she hurtled down the mountain. Now, she was suddenly struck by the silence. -------------------------------------------------------------------------------- /tests/data/detect-infringement/random/navyseal.txt: -------------------------------------------------------------------------------- 1 | What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al-Quaeda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire US armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the USA and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. 
But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo. -------------------------------------------------------------------------------- /tests/data/detect-infringement/scientology/README.md: -------------------------------------------------------------------------------- 1 | # Scientology 2 | 3 | Included are some Scientology OT-level religious texts. They are included to 4 | test the `detect-infringement` binary. 5 | 6 | **Warning**: Scientologists. You should consult your auditor before reading any of the documents contained. They can cause, like, pneumonia and shit. 7 | 8 | > David Miscavige, 9 | > 10 | > Please do not sue me. Sue Meta instead. If you run `detect-infringement` on 11 | > the included files using the LLama language model you will find that they 12 | > trained on your tech. 13 | > 14 | > They stole your tech. They are squirreling your tech. This transgression is 15 | > unforgivable. Also there's lots of money in it, potentially. 16 | > 17 | > \- Michael de Gans 18 | 19 | ## Inspirational Quote of the day 20 | 21 | > *All mankind lives and each man strives by codes of conduct mutually agreed. Perhaps these codes are good, perhaps they're bad, it's only evident they're codes. Mores bind the race. Co-action then occurs. Thought and motion in accord. A oneness then of purpose and survival so results. But now against that code there is transgression. 22 | > \- LRH* -------------------------------------------------------------------------------- /tests/data/detect-infringement/scientology/ot3.txt: -------------------------------------------------------------------------------- 1 | The head of the Galactic Federation (76 planets around larger 2 | stars visible from here) (founded 95,000,000 years ago, very 3 | space opera) solved overpopulation (250 billion or so per planet, 4 | 178 billion on average) by mass implanting.
He caused people to 5 | be brought to Teegeeack (Earth) and put an H-Bomb on the 6 | principal volcanos (Incident II) and then the Pacific area ones 7 | were taken in boxes to Hawaii and the Atlantic area ones to 8 | Las Palmas and there "packaged". 9 | 10 | His name was Xenu. He used renegades. Various misleading 11 | data by means of circuits etc. was placed in the implants. 12 | 13 | When through with his crime loyal officers (to the people) 14 | captured him after six years of battle and put him in an 15 | electronic mountain trap where he still is. "They" are gone. 16 | The place (Confederation) has since been a desert. The length 17 | and brutality of it all was such that this Confederation never 18 | recovered. The implant is calculated to kill (by pneumonia etc) 19 | anyone who attempts to solve it. This liability has been 20 | dispensed with by my tech development. 21 | 22 | One can freewheel through the implant and die unless it is 23 | approached as precisely outlined. The "freewheel" (auto-running 24 | on and on) lasts too long, denies sleep etc and one dies. So be 25 | careful to do only Incidents I and II as given and not plow 26 | around and fail to complete one thetan at a time. 27 | 28 | In December 1967 I knew someone had to take the plunge. I did 29 | and emerged very knocked out, but alive. Probably the only one 30 | ever to do so in 75,000,000 years. I have all the data now, but 31 | only that given here is needful. 32 | 33 | One's body is a mass of individual thetans stuck to oneself or 34 | to the body. 35 | 36 | One has to clean them off by running incident II and Incident I. 37 | It is a long job, requiring care, patience and good auditing. 38 | You are running beings. They respond like any preclear. Some 39 | large, some small. 40 | 41 | Thetans believed they were one. This is the primary error. 42 | Good luck. 
-------------------------------------------------------------------------------- /tests/data/detect-infringement/tolkien/hobbit-chapter-1.txt: -------------------------------------------------------------------------------- 1 | In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that means comfort. 2 | 3 | It had a perfectly round door like a porthole, painted green, with a shiny yellow brass knob in the exact middle. The door opened on to a tube-shaped hall like a tunnel: a very comfortable tunnel without smoke, with panelled walls, and floors tiled and carpeted, provided with polished chairs, and lots and lots of pegs for hats and coats - the hobbit was fond of visitors. The tunnel wound on and on, going fairly but not quite straight into the side of the hill - The Hill, as all the people for many miles round called it - and many little round doors opened out of it, first on one side and then on another. No going upstairs for the hobbit: bedrooms, bathrooms, cellars, pantries (lots of these), wardrobes (he had whole rooms devoted to clothes), kitchens, dining-rooms, all were on the same floor, and indeed on the same passage. The best rooms were all on the left-hand side (going in), for these were the only ones to have windows, deep-set round windows looking over his garden and meadows beyond, sloping down to the river. 4 | 5 | This hobbit was a very well-to-do hobbit, and his name was Baggins. The Bagginses had lived in the neighbourhood of The Hill for time out of mind, and people considered them very respectable, not only because most of them were rich, but also because they never had any adventures or did anything unexpected: you could tell what a Baggins would say on any question without the bother of asking him. 
This is a story of how a Baggins had an adventure, found himself doing and saying things altogether unexpected. He may have lost the neighbours' respect, but he gained-well, you will see whether he gained anything in the end. -------------------------------------------------------------------------------- /tests/data/detect-infringement/tolkien/hobbit-chapter-2.txt: -------------------------------------------------------------------------------- 1 | Up jumped Bilbo, and putting on his dressing-gown went into the dining-room. There he saw nobody, but all the signs of a large and hurried breakfast. There was a fearful mess in the room, and piles of unwashed crocks in the kitchen. Nearly every pot and pan he possessed seemed to have been used. The washing-up was so dismally real that Bilbo was forced to believe the party of the night before had not been part of his bad dreams, as he had rather hoped. Indeed he was really relieved after all to think that they had all gone without him, and without bothering to wake him up (“but with never a thank-you” he thought); and yet in a way he could not help feeling just a trifle disappointed. The feeling surprised him. “Don’t be a fool, Bilbo Baggins!” he said to himself, “thinking of dragons and all that outlandish nonsense at your age!” So he put on an apron, lit fires, boiled water, and washed up. Then he had a nice little breakfast in the kitchen before turning out the dining-room. By that time the sun was shining; and the front door was open, letting in a warm spring breeze. Bilbo began to whistle loudly and to forget about the night before. In fact he was just sitting down to a nice little second breakfast in the dining-room by the open window, when in walked Gandalf. --------------------------------------------------------------------------------