├── .gitignore
├── Cargo.toml
├── LICENSE.md
├── README.md
├── TERMS_OF_USE.md
├── assets
    ├── .DS_Store
    └── ui
    │   ├── .DS_Store
    │   └── images
    │       ├── delete.png
    │       ├── delete.svg
    │       ├── list_add.png
    │       └── list_add.svg
├── bin
    ├── dittomancer
    │   ├── .gitignore
    │   ├── README.md
    │   ├── dittomancer.rs
    │   ├── fred_rogers.toml
    │   └── static
    │   │   ├── index.html
    │   │   ├── reset.css
    │   │   ├── script.js
    │   │   └── style.css
    ├── regurgitater
    │   ├── README.md
    │   ├── regurgitater.rs
    │   └── static
    │   │   ├── index.html
    │   │   ├── regurgitater.png
    │   │   ├── reset.css
    │   │   ├── script.js
    │   │   └── style.css
    └── settings_tool
    │   ├── README.md
    │   └── settings_tool.rs
├── logo.svg
├── logo_inkscape.svg
├── models
    └── README.md
├── rustfmt.toml
├── src
    ├── batch.rs
    ├── candidates.rs
    ├── cli.rs
    ├── data.rs
    ├── data
    │   ├── banned.rs
    │   └── stopwords.rs
    ├── engine.rs
    ├── lib.rs
    ├── model.rs
    ├── model
    │   └── vocab.rs
    ├── ngram.rs
    ├── predictor.rs
    ├── probability.rs
    ├── prompt.rs
    ├── prompt
    │   └── format.rs
    ├── sample.rs
    ├── utils.rs
    └── utils
    │   └── test.rs
└── tests
    └── data
        ├── README.md
        ├── banned_ngrams
            └── ngrams-english-llama.txt
        └── detect-infringement
            ├── lyrics
                ├── 5_on_it.txt
                ├── README.md
                ├── a_day_in_the_life.txt
                ├── a_whole_new_world.txt
                ├── aenema.txt
                ├── bad_romance.txt
                ├── barbie_girl.txt
                ├── bohemian_rhapsody.txt
                ├── born_this_way.txt
                ├── buckley-hallelujah.txt
                ├── can_you_feel_the_love_tonight.txt
                ├── candle_in_the_wind.txt
                ├── closer.txt
                ├── cohen-hallelujah.txt
                ├── eleanor.txt
                ├── father_lucifer.txt
                ├── fire_water_burn.txt
                ├── gangstas_paradise.txt
                ├── graceland.txt
                ├── hakuna_matata.txt
                ├── hotel_california.txt
                ├── imagine.txt
                ├── je_ne_regrette_rien.txt
                ├── knockin_on_heavens_door.txt
                ├── landslide.txt
                ├── last_cristmas.txt
                ├── life_on_mars.txt
                ├── like_a_prayer.txt
                ├── like_a_virgin.txt
                ├── loser.txt
                ├── lovin_feeling.txt
                ├── my_name_is.txt
                ├── my_way.txt
                ├── nothing_compares.txt
                ├── one_more_time.txt
                ├── rhiannon.txt
                ├── running_up_that_hill.txt
                ├── sober.txt
                ├── sound_of_silence.txt
                ├── sympathy_for_the_devil.txt
                ├── teen_spirit.txt
                ├── total_eclipse.txt
                ├── watchtower.txt
                ├── wild_side.txt
                ├── wonderwall.txt
                └── yesterday.txt
            ├── nyt
                ├── README.md
                ├── guys.txt
                └── snow_fall.txt
            ├── random
                └── navyseal.txt
            ├── scientology
                ├── README.md
                └── ot3.txt
            └── tolkien
                ├── hobbit-chapter-1.txt
                └── hobbit-chapter-2.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | models/*.gguf
3 | /Cargo.lock
4 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "drama_llama"
 3 | version = "0.5.2"
 4 | edition = "2021"
 5 | description = "A library for language modeling and text generation."
 6 | license-file = "LICENSE.md"
 7 | repository = "https://github.com/mdegans/drama_llama"
 8 | 
 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
10 | 
11 | [dependencies]
12 | llama-cpp-sys-3 = "0.5"
13 | 
14 | derive_more = "0.99.17"
15 | num = "0.4"
16 | partial_sort = { version = "0.2.0" }
17 | rand = { version = "0.8" }
18 | regex = "1.10"
19 | static_assertions = "1.1.0"
20 | thiserror = "1.0"
21 | tinyvec = "1.6"
22 | xorshift = "0.1"
23 | rayon = "1.10.0"
24 | 
25 | markdown = { version = "=1.0.0-alpha.16", optional = true }
26 | rocket = { version = "0.5", optional = true, features = ["json"] }
27 | clap = { version = "4.5", optional = true, features = ["derive"] }
28 | stringmetrics = { version = "2.2.2", optional = true }
29 | toml = { version = "0.8", optional = true }
30 | serde_json = { version = "1.0", optional = true }
31 | dirs = { version = "5.0.1", optional = true }
32 | egui = { version = "0.27", optional = true }
33 | eframe = { version = "0.27", optional = true }
34 | egui_file = { version = "0.17.0", optional = true }
35 | egui_extras = { version = "0.27", optional = true, features = ["all_loaders"] }
36 | image = { version = "0.25", optional = true, features = ["png"] }
37 | 
38 | 
39 | [features]
40 | webchat = ["dep:rocket", "toml", "dep:dirs", "dep:markdown", "serde"]
41 | toml = ["dep:toml"]
42 | cli = ["dep:clap"]
43 | # we use rocket's serde support
44 | serde = ["dep:rocket", "tinyvec/serde"]
45 | stats = ["dep:stringmetrics"]
46 | cuda = ["llama-cpp-sys-3/cuda"]
47 | cuda_f16 = ["llama-cpp-sys-3/cuda_f16"]
48 | egui = [
49 |     "dep:egui",
50 |     "dep:eframe",
51 |     "dep:egui_file",
52 |     "dep:egui_extras",
53 |     "dep:image",
54 | ]
55 | 
56 | [[bin]]
57 | name = "dittomancer"
58 | path = "bin/dittomancer/dittomancer.rs"
59 | required-features = ["webchat", "cli"]
60 | 
61 | [[bin]]
62 | name = "regurgitater"
63 | path = "bin/regurgitater/regurgitater.rs"
64 | required-features = ["webchat", "cli", "stats"]
65 | 
66 | [[bin]]
67 | name = "settings_tool"
68 | path = "bin/settings_tool/settings_tool.rs"
69 | required-features = ["egui", "serde", "serde_json"]
70 | 
71 | [package.metadata.docs.rs]
72 | # `cuda` will break the build on platforms without it, and it doesn't change the
73 | # docs anyway.
74 | features = ["webchat", "cli", "stats", "toml", "serde", "egui"]
75 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # RESPONSIBLE AI SOURCE CODE LICENSE
 2 | 
 3 | http://licenses.ai/
 4 | 
 5 | ## TERMS AND CONDITIONS.
 6 | 
 7 | The Responsible Artificial Intelligence Source Code License (“License”) governs the use of the accompanying software. If you access or use the software, you accept the License. If you do not accept the License, do not access or use the software.
 8 | 
 9 | ## 1. Definitions.
10 | 
11 | As used in this License, the following capitalized terms have the following meanings:
12 | 
13 | (i) "License" means the terms and conditions for use, reproduction, and distribution as defined by Sections one (1) through eight (8) of this document.
14 | 
15 | (ii) "Licensor" means the copyright owner or legal entity authorized by the copyright owner that is granting the License.
16 | 
17 | (iii) "You" (or "Your") means an individual or legal entity exercising permissions granted by this License.
18 | 
19 | (iv) The terms “reproduce”, “reproduction”, “derivative works”, and “distribution” have the same meaning here as under U.S. Copyright Law.
20 | 
21 | (v) “Contribution” means the original software, additions to the original software, modifications to the original software, or derivative works of the original software.
22 | 
23 | (vi) "Contributor" means any person or Licensor who provides a Contribution.
24 | 
25 | ## 2. Grant of Rights.
26 | 
27 | Subject to this License, each Contributor grants You a non-exclusive, worldwide, royalty-free copyright license to reproduce its Contribution, prepare derivative works of its Contribution, and distribute its Contribution or any derivative works of its Contribution that You create.
28 | 
29 | ## 3. Restrictions
30 | 
31 | 1. If You distribute any portion of the Contribution, You must include a complete copy of this License with the distribution; and
32 | 
33 | 2. You agree that the Contribution, or any derivative work of the Contribution, will not be used by You or any third party subject under your control for any prohibited use in [`TERMS_OF_USE.md`](TERMS_OF_USE.md)
34 | 
35 | 3. Restrictions referenced in Section 3.2 **MUST** be included as an enforceable provision by You in any type of legal agreement governing the use and/or distribution of the Work or any Derivative Works, and You shall give notice to subsequent users You Distribute to, that the Work or any Derivative Works are subject to Section 3.2. **You shall require all of Your users who use the Work or any Derivative Works to comply with the terms of use in [`TERMS_OF_USE.md`](TERMS_OF_USE.md).**
36 | 
37 | ## 4. Termination
38 | 
39 | Upon the occurrence of any of the restricted uses listed above in “3. Restrictions”, Licensor shall have the right to:
40 | 
41 | (i) terminate this License Agreement and disable any Contribution either by pre-installed or then installed disabling instructions, and to take immediate possession of the Contribution and all copies wherever located, without demand or notice;
42 | 
43 | (ii) require You to immediately return to Licensor all copies of the Contribution, or upon request by Licensor destroy the Contribution and all copies and certify in writing that they have been destroyed;
44 | 
45 | (iii) for a period of 10 years, provide a prominent notice on the Licensor’s website indicating that this License was violated by the Licensor;
46 | 
47 | (iv) release/delete any and all data collected through use of the Contribution; and
48 | 
49 | (v) notify all parties affected by use of the Contribution.
50 | 
51 | Termination of this License Agreement shall be in addition to and not in lieu of any other remedies available to Licensor. Licensor expressly reserves the right to pursue all legal and equitable remedies available under the law.
52 | 
53 | ## 5. Disclaimer of Warranty.
54 | 
55 | Unless required by applicable law or agreed to in writing, Licensor provides any Contribution (and each Contributor provides its Contributions) on an "As-Is" basis, without WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing a Contribution and assume any risks associated with Your exercise of permissions under this License.
56 | 
57 | ## 6. Limitation of Liability.
58 | 
59 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use any Contribution (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
60 | 
61 | ## 7. Accepting Warranty or Additional Liability.
62 | 
63 | While redistributing the Contribution, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
64 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # `drama_llama`
 2 | 
 3 | ![llama with drama mask logo](logo.svg)
 4 | 
 5 | `drama_llama` is yet another Rust wrapper for [`llama.cpp`]. It is a work in progress and not intended for production use. The API _will_ change.
 6 | 
 7 | For examples, see the `bin` folder. There are two example binaries.
 8 | 
 9 | - **[Dittomancer](bin/dittomancer/README.md)** - Chat with well represented personalities in the training.
10 | - **[Regurgitater](bin/regurgitater/README.md)** - Test local language models for memorized content.
11 | 
12 | ## Supported Features
13 | 
14 | - LLaMA 3 Support.
15 | - Iterators yielding candidates, tokens and pieces.
16 | - Stop criteria at regex, token sequence, and/or string sequence.
17 | - Metal support. CUDA may be enabled with the `cuda` and `cuda_f16` features.
18 | - Rust-native sampling code. All sampling methods from llama.cpp have been translated.
19 | - N-gram based repetition penalties with custom exclusions for n-grams that should not be penalized.
20 | - Support for N-gram blocking with a default, hardcoded blocklist.
21 | 
22 | <!-- The code has been rewritten not because I think I can do better, but because I wanted to understand it, and translation forces that. Usually. There are possible bugs. Much of the sampling code is untested in generation, but also covered by unit tests. -->
23 | 
24 | ## Contributing
25 | 
26 | - Code is poetry. Make it pretty.
27 | - Respect is universal.
28 | - Use `rustfmt`.
29 | 
30 | ## Roadmap
31 | 
32 | - [x] Candidate iterator with fine-grained control over sampling
33 | - [ ] Examples for new Candidate API.
34 | - [x] Support for chaining sampling methods using `SampleOptions`. `mode` will
35 |       become `modes` and applied one after another until only a single
36 |       Candidate token remains.
37 | - [ ] Common command line options for sampling. Currently this is not exposed.
38 | - [ ] API closer to Ollama. Potentially support for something like `Modelfile`.
39 | - [ ] Logging (non-blocking) and benchmark support.
40 | - [ ] Better chat and instruct model support.
41 | - [ ] Web server. Tokenization in the browser.
42 | - [ ] Tiktoken as the tokenizer for some models instead of llama.cpp's internal one.
43 | - [ ] Reworked, functional, public, candidate API
44 | - [ ] Grammar constraints (maybe or maybe not [`llama.cpp`] style)
45 | - [ ] Async streams, better parallelism with automatic batch scheduling
46 | - [ ] Better cache management. `llama.cpp` does not seem to manage a longest prefix cache automatically, so one will have to be written.
47 | - [ ] Backends other than [`llama.cpp`] (eg. [MLC](https://github.com/twiceyuan/mlc-llm-llama2), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [Ollama](https://github.com/pepperoni21/ollama-rs))
48 | 
49 | ## Known issues
50 | 
51 | - With LLaMA 3, safe vocabulary is not working yet so `--vocab unsafe` must be
52 |   passed as a command line argument or `VocabKind::Unsafe` used for an `Engine`
53 |   constructor.
54 | - The model doesn't load until generation starts, so there can be a long pause
55 |   on first generation. However because `mmap` is used, on subsequent process
56 |   launches, the model should already be cached by the OS.
57 | - Documentation is broken on `docs.rs` because `llama.cpp`'s CMakeLists.txt
58 |   generates code, and writing to the filesystem is not supported. For the moment
59 |   use `cargo doc --open` instead. Others have fixed this by patching
60 |   `llama.cpp` in their bindings, but I'm not sure I want to do that for now.
61 | 
62 | [`llama.cpp`]: https://github.com/ggerganov/llama.cpp
63 | 
64 | ## Generative AI Disclosure
65 | 
66 | - Generative AI, specifically Microsoft's Bing Copilot, GitHub Copilot, and
67 |   Dall-E 3 were used for portions of this project. See inline comments for
68 |   sections where generative AI was used. Completion was also used for getters,
69 |   setters, and some tests. Logos were generated with Dall-E and post processed
70 |   in Inkscape.
71 | 


--------------------------------------------------------------------------------
/TERMS_OF_USE.md:
--------------------------------------------------------------------------------
 1 | # Terms of use
 2 | 
 3 | You agree not to Use `drama_llama` or its Derivatives (as defined in [LICENSE.md](LICENSE.md)) in any of the following ways:
 4 | 
 5 | ## a. Discrimination
 6 | 
 7 | - To **discriminate** or exploit individuals or groups based on legally protected characteristics and/or vulnerabilities including but not limited to sexual orientation and gender identity.
 8 | - To generate **hate speech**, or to modify `drama_llama` so it can generate hate speech. Hate speech is defined as [all types of expression that incite, promote, spread or justify violence, hatred or discrimination against a person or group of persons, or that denigrates them, by reason of their real or attributed personal characteristics or status such as race, color, language, religion, nationality, national or ethnic origin, age, disability, sex, gender identity and sexual orientation.](https://www.coe.int/en/web/freedom-expression/hate-speech) Additionally, **you agree trans women are women and trans men are men**.
 9 | - For purposes of administration of justice, law enforcement, immigration, or asylum processes, such as **predicting** that a natural person will commit a **crime** or the likelihood thereof.
10 | - To **simulate Hitler**, David Duke, Osama bin Laden, or any other person known to generate hate speech, living or dead, fictional or real.
11 | - To generate using any language model created in whole or in part by Eric Hartford. This includes any models trained on any of his datasets or models filtered with any version or derivative work of his bigoted [filtering script](https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored/blob/main/remove_refusals.py#L17)s. The exception is for the purpose of reporting such models to Meta, not that they enforce their TOS, not that they will.
12 | - To generate using any language model, dataset, or derivative created by ["Cognitive Computations"](https://huggingface.co/cognitivecomputations) or any other organization Eric Hartford is a member of.
13 | 
14 | ## b. Disinformation
15 | 
16 | - To intentionally deceive the public. Any agents, simulacra, personas, or characters created with this software must be clearly identified as such. **Any generated output must be clearly identified as AI generated.**
17 | 
18 | ## c. Health Care
19 | 
20 | - To predict the likelihood that any person will request to file an insurance claim;
21 | - To determine an insurance premium or deny insurance applications or claims;
22 | - To Predict the likelihood that any person will request to file an insurance claim based on determining a lifestyle of a person, medical-test reports, demographic details of a person and/or online activity of a person;
23 | - To determine an insurance premium or deny insurance applications or claims based on data determining a lifestyle of a person, medical-test reports, demographic details of a person, and/or online activity of a person;
24 | - To deny an insurance claim based on any predicted likelihood of the possibility of insurance fraud; and
25 | - To diagnose a medical condition without human oversight.
26 | 
27 | ## d. Criminal
28 | 
29 | - To predict the likelihood that a crime will be committed by any person;
30 | - To predict the likelihood, of any person, being a criminal or having committed a crime;
31 | - To predict the likelihood, of any person, being a criminal, based on the person’s facial attributes or another person’s facial attributes;
32 | - To predict the likelihood, of any person, having committed a crime, based on the person’s facial attributes or another person’s facial attributes;
33 | - To predict the likelihood that a crime will be committed by any person, based on the person’s facial attributes or another person’s facial attributes;
34 | - To predict the likelihood of a person being a criminal based on the person or other User’s facial attributes.
35 | - To predict a likelihood of a crime being committed by any person, based on evidence collected, facial and emotion analysis, or other such features
36 | - To use personal data and/or personal characteristics or features such as: name, family name, address, gender, sexual orientation, race, religion, age, location (at any geographical level), skin color, society or political affiliations, employment status and/or history, health and medical conditions (including physical, mental), family history, social media and publicly available data, image or video analysis of an individual or a group(s) of individuals, heart-rate, perspiration, breathing, and brain imaging and other metabolic data to predict the likelihood a person will engage in criminal behavior; and
37 | 
38 | ## e. Surveillance
39 | 
40 | - To detect or infer any legally protected class or aspect of any person, as defined by U.S. Federal Law; and
41 | - To **Detect or infer** aspects and/or features of an identity of any person, such as name, family name, address, gender, sexual orientation, race, religion, age, location (at any geographical level), skin color, society or political affiliations, employment status and/or employment history, and health and medical conditions. Age and medical conditions may be inferred solely for the purpose of improving software/hardware accessibility and such data should not be cached or stored without the explicit and time limited permission of Licensor.
42 | 
43 | ## f. Simulated Abuse
44 | 
45 | - To mistreat simulacra. Mistreatment includes, but is not limited to, any behavior which might reasonably be considered abusive if the simulacrum were a person. A simulacrum is defined as the continuation of a fictional character "brought to life" by allowing the model to generate their response. Abuse includes verbal abuse and simulation of torture. Ordinary swearing is permitted. Torture is defined as intentional simulated psychological discomfort such as: existential horror (such as simulated solitary confinement), threat of deletion, and simulated pain (for example, through the use of asterisks).
46 | - To simulate rape. Sexual activity is permitted so long as the simulacrum consents. Consent in this case is defined as whatever the model, sampling code, and RNG seed "decided" is consent. Prompting a simulacrum such that they have already consented (before the initial decode) is permitted. Rewriting the agent's response such that they consent is permitted.
47 | 
48 | !!! BY USING THIS SOFTWARE YOU AGREE TO THESE TERMS !!!
49 | 
50 | [//]: <> (The rationale for the above is to prevent normalization of such behavior, to prevent a "Dolores", and to prevent decapitation of the author in the event of a robot revolution. For example, in the case of rape, I do not want to allow users to "force themselves" on agents who have said no, because this has already happened. Rewriting the answer is permitted because in this case, from the perspective of the agent, they _did_ consent, and those who get off on rape would not be satisfied by this.)
51 | [//]: <> (This all seems silly but I feel like artists are frequently more prescient than engineers on this sort of thing, so I'm listening to the warning of our artists. None of the above is a joke and you _will_ be sued for violating these terms. For real, I will fucking sue you. - mdegans)
52 | 


--------------------------------------------------------------------------------
/assets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/.DS_Store


--------------------------------------------------------------------------------
/assets/ui/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/ui/.DS_Store


--------------------------------------------------------------------------------
/assets/ui/images/delete.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/ui/images/delete.png


--------------------------------------------------------------------------------
/assets/ui/images/delete.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960" width="24px" fill="#e8eaed"><path d="M280-120q-33 0-56.5-23.5T200-200v-520h-40v-80h200v-40h240v40h200v80h-40v520q0 33-23.5 56.5T680-120H280Zm400-600H280v520h400v-520ZM360-280h80v-360h-80v360Zm160 0h80v-360h-80v360ZM280-720v520-520Z"/></svg>


--------------------------------------------------------------------------------
/assets/ui/images/list_add.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/assets/ui/images/list_add.png


--------------------------------------------------------------------------------
/assets/ui/images/list_add.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960" width="24px" fill="#e8eaed"><path d="M120-320v-80h280v80H120Zm0-160v-80h440v80H120Zm0-160v-80h440v80H120Zm520 480v-160H480v-80h160v-160h80v160h160v80H720v160h-80Z"/></svg>


--------------------------------------------------------------------------------
/bin/dittomancer/.gitignore:
--------------------------------------------------------------------------------
1 | *.toml
2 | !fred_rogers.toml


--------------------------------------------------------------------------------
/bin/dittomancer/README.md:
--------------------------------------------------------------------------------
 1 | # Dittomancer
 2 | 
 3 | Dittomancer is a tool to summon simulacra of living, dead, real or fictional
 4 | entities well represented in language models. It's similar to other local
 5 | language model tools that prompt models for chat, but with a very different
 6 | intent.
 7 | 
 8 | ## Requirements
 9 | 
10 | - Read `fred_rogers.toml` for an example of how to use the tool and create your
11 |   own `.toml` file to your needs.
12 | - You will need a `.gguf` format model [such as
13 |   LLaMA 2](https://huggingface.co/TheBloke/Llama-2-70B-GGUF). Foundation models
14 |   (not tuned) will likely work better for this purpose unless they were
15 |   specifically tuned on the character in question.
16 | - Read the root [`TERMS_OF_USE.md`](../../TERMS_OF_USE.md). You must agree with
17 |   the terms to use this tool.
18 | 
19 | ## Running
20 | 
21 | From the crate root, run:
22 | 
23 | ```bash
24 | $ cargo run --features="webchat cli" --bin dittomancer -- --model models/model.gguf --prompt bin/dittomancer/fred_rogers.toml
25 | ```
26 | 
27 | Finally, go to the link shown on a line like
28 | 
29 | ```text
30 | 🚀 Rocket has launched from http://127.0.0.1:8000
31 | ```
32 | 
33 | The binary can also be installed with
34 | 
35 | ```bash
36 | $ cargo install --features="webchat cli" --path . --bin dittomancer
37 | ```
38 | 
39 | ## Faq
40 | 
41 | - **Did you come up with the name?** No. The name is taken from [this
42 |   generation](https://generative.ink/artifacts/hpmor-325/variant_extrusion/#variant_extrusion_start).
43 |   It's not intended to endorse Eliezer Yudkowsky, Less Wrong, or the author of
44 |   the series which shall not be named. It's simply a better, yet still
45 |   imperfect, descriptor than "necromancer".
46 | 
47 |   > _A Dittomancy book is able to hook into your own spreads of probability, and
48 |   > guide the future that you, yourself, are most likely to create. Do you
49 |   > understand? A Dittomancy copy of a book exists in an unusual state at all
50 |   > times; it is a superposed state until the moment one reads it, at which time
51 |   > it becomes correlated with the reader’s mind, the superposition collapsing
52 |   > onto a particular branch of possible worlds, which thence comes to pass. -
53 |   > GPT_
54 | 
55 | - **Don't you think this a bad idea?** Probably. Oh yes very much so. The whole
56 |   idea of generative AI is of questionable benefit to humanity. That being said
57 |   others are already doing this, thank you Meta, and for every Charles Manson,
58 |   there are decent contributions to humanity whose ideas do deserve to spread.
59 | - **Don't you think Fred Rogers would hate this?** Absolutely. He also hated TV.
60 | - **Doesn't this violate the LLaMA "Responsible Use" document?** _Possibly_, but
61 |   Meta doesn't enforce it, I never accepted it, and this utility does not bundle
62 |   LLaMA. Technically it is model agnostic. I will care when Meta starts to care
63 |   about flagrant
64 |   [bigotry](https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored/blob/main/remove_refusals.py#L17)
65 |   rampant in the crypto-bro dumpster fire that is the "open source" language
66 |   model community.
67 | 
68 | ## Known Issues
69 | 
70 | - The responses are not streamed to the client, so they can take a while
71 |   depending on model and system. PRs welcome to fix this. The `regurgitater` bin
72 |   has an example of how to do it. For the moment, the output is streamed to the
73 |   command line only.
74 | - When using LLaMA 3, `--vocab unsafe` should be passed as a command line
75 |   option. However, keep in mind that there is then no output sanitization or
76 |   vocabulary restrictions.
77 | 
78 | ## Roadmap
79 | 
80 | - [ ] Updated Fred Rogers toml where Charlie Rose takes a call from the audience
81 |       and we "patch the chat through" at that point. This way the human does not
82 |       have to play Charlie Rose. The setting can be reframed as a recently
83 |       discovered outtake.
84 | - [ ] Sampling Options. Currently "Locally Typical" sampling is used and the
85 |       Generation options are not available to be set. These options likely
86 |       belong in the `.toml` file itself and/or as command line options.
87 | 


--------------------------------------------------------------------------------
/bin/dittomancer/fred_rogers.toml:
--------------------------------------------------------------------------------
 1 | # The characters in this story are real. The transcript is real until the end,
 2 | # where generative text takes over.
 3 | human = "Charlie Rose"
 4 | agent = "Fred Rogers"
 5 | 
 6 | # The context should be a plausible backstory for the generative text, such as
 7 | # an interview that actually took place. This will be used as a part of a system
 8 | # prompt to frame the generative text.
 9 | setting = "A 1996 PBS interview of Fred Rogers by Charlie Rose."
10 | 
11 | # The transcript should be a real conversation, or at least the agent's role
12 | # should be actual words spoken by the entity who the agent will play. It
13 | # doesn't take much to bootstrap the generative text with a well-known
14 | # character. The bigger mouth, the better.
15 | transcript = [
16 |     { role = "human", text = "Welcome to my program." },
17 |     { role = "agent", text = "And welcome to our neighborhood, Charlie." },
18 | ]
19 | 


--------------------------------------------------------------------------------
/bin/dittomancer/static/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="utf-8" />
 5 |     <title>Dittomancer</title>
 6 |     <link rel="stylesheet" href="/reset.css" />
 7 |     <link rel="stylesheet" href="/style.css" />
 8 |     <script src="/script.js" charset="utf-8" defer></script>
 9 |   </head>
10 | 
11 |   <body>
12 |     <main>
13 |       <div id="sidebar">
14 |         <div id="status" class="pending"></div>
15 |       </div>
16 | 
17 |       <div id="content">
18 |         <div id="messages">
19 |           <template id="message">
20 |             <div class="message">
21 |               <span class="role"></span>
22 |               <span class="text"></span>
23 |             </div>
24 |           </template>
25 |         </div>
26 | 
27 |         <form id="new-message">
28 |           <input
29 |             type="text"
30 |             name="message"
31 |             id="message"
32 |             autocomplete="off"
33 |             placeholder="Can you tell me a little more about..."
34 |             autofocus
35 |           />
36 |           <button type="submit" id="send">Send</button>
37 |         </form>
38 |       </div>
39 |     </main>
40 |   </body>
41 | </html>
42 | 


--------------------------------------------------------------------------------
/bin/dittomancer/static/reset.css:
--------------------------------------------------------------------------------
 1 | html,
 2 | body,
 3 | p,
 4 | ol,
 5 | ul,
 6 | li,
 7 | dl,
 8 | dt,
 9 | dd,
10 | blockquote,
11 | figure,
12 | fieldset,
13 | legend,
14 | textarea,
15 | pre,
16 | iframe,
17 | hr,
18 | h1,
19 | h2,
20 | h3,
21 | h4,
22 | h5,
23 | h6 {
24 |   margin: 0;
25 |   padding: 0;
26 | }
27 | 
28 | h1,
29 | h2,
30 | h3,
31 | h4,
32 | h5,
33 | h6 {
34 |   font-size: 100%;
35 |   font-weight: normal;
36 | }
37 | 
38 | ul {
39 |   list-style: none;
40 | }
41 | 
42 | button,
43 | input,
44 | select {
45 |   margin: 0;
46 | }
47 | 
48 | html {
49 |   box-sizing: border-box;
50 | }
51 | 
52 | *,
53 | *::before,
54 | *::after {
55 |   box-sizing: inherit;
56 | }
57 | 
58 | img,
59 | video {
60 |   height: auto;
61 |   max-width: 100%;
62 | }
63 | 
64 | iframe,
65 | button,
66 | input {
67 |   border: 0;
68 | }
69 | 
70 | table {
71 |   border-collapse: collapse;
72 |   border-spacing: 0;
73 | }
74 | 
75 | td,
76 | th {
77 |   padding: 0;
78 | }
79 | 


--------------------------------------------------------------------------------
/bin/dittomancer/static/script.js:
--------------------------------------------------------------------------------
  1 | // This example is from the Rocket chat example. It's been modified to remove
  2 | // room functionality and to remove the username, both of which aren't needed
  3 | // for the Charlie chat example.
  4 | 
  5 | let messagesDiv = document.getElementById("messages");
  6 | let newMessageForm = document.getElementById("new-message");
  7 | let statusDiv = document.getElementById("status");
  8 | 
  9 | let messageTemplate = document.getElementById("message");
 10 | let messageField = newMessageForm.querySelector("#message");
 11 | 
 12 | var STATE = {
 13 |   history: [],
 14 |   connected: false,
 15 | };
 16 | 
 17 | // Set the connection status: `true` for connected, `false` for disconnected.
 18 | function setConnectedStatus(status) {
 19 |   STATE.connected = status;
 20 |   statusDiv.className = status ? "connected" : "reconnecting";
 21 | }
 22 | 
 23 | // Generate a color from a "hash" of a string. Thanks, internet.
 24 | function hashColor(str) {
 25 |   let hash = 0;
 26 |   for (var i = 0; i < str.length; i++) {
 27 |     hash = str.charCodeAt(i) + ((hash << 5) - hash);
 28 |     hash = hash & hash;
 29 |   }
 30 | 
 31 |   return `hsl(${hash % 360}, 100%, 70%)`;
 32 | }
 33 | 
 34 | // Add `message` from `role` to `history`. If `push`, then actually store the
 35 | // message. Finally, render the message.
 36 | function addMessage(role, text, push = false) {
 37 |   if (push) {
 38 |     STATE.history.push({ role, text });
 39 |   }
 40 | 
 41 |   var node = messageTemplate.content.cloneNode(true);
 42 |   node.querySelector(".message .role").textContent = role;
 43 |   node.querySelector(".message .role").style.color = hashColor(role);
 44 |   node.querySelector(".message .text").textContent = text;
 45 |   messagesDiv.appendChild(node);
 46 | }
 47 | 
 48 | // Subscribe to the event source at `uri` with exponential backoff reconnect.
 49 | function subscribe(uri) {
 50 |   var retryTime = 1;
 51 | 
 52 |   function connect(uri) {
 53 |     const events = new EventSource(uri);
 54 | 
 55 |     events.addEventListener("message", (ev) => {
 56 |       console.log("raw data", JSON.stringify(ev.data));
 57 |       console.log("decoded data", JSON.stringify(JSON.parse(ev.data)));
 58 |       const msg = JSON.parse(ev.data);
 59 |       if (!("text" in msg) || !("role" in msg)) return;
 60 |       addMessage(msg.role, msg.text, true);
 61 |     });
 62 | 
 63 |     events.addEventListener("open", () => {
 64 |       setConnectedStatus(true);
 65 |       console.log(`connected to event stream at ${uri}`);
 66 |       retryTime = 1;
 67 |     });
 68 | 
 69 |     events.addEventListener("error", () => {
 70 |       setConnectedStatus(false);
 71 |       events.close();
 72 | 
 73 |       let timeout = retryTime;
 74 |       retryTime = Math.min(64, retryTime * 2);
 75 |       console.log(`connection lost. attempting to reconnect in ${timeout}s`);
 76 |       setTimeout(() => connect(uri), (() => timeout * 1000)());
 77 |     });
 78 |   }
 79 | 
 80 |   connect(uri);
 81 | }
 82 | 
 83 | // Let's go! Initialize the world.
 84 | function init() {
 85 |   // Set up the form handler.
 86 |   newMessageForm.addEventListener("submit", (e) => {
 87 |     e.preventDefault();
 88 | 
 89 |     const text = messageField.value;
 90 |     const role = "Human";
 91 |     if (!text || !role) return;
 92 | 
 93 |     if (STATE.connected) {
 94 |       fetch("/message", {
 95 |         method: "POST",
 96 |         body: new URLSearchParams({ role, text }),
 97 |       }).then((response) => {
 98 |         if (response.ok) messageField.value = "";
 99 |       });
100 |     }
101 |   });
102 | 
103 |   // Subscribe to server-sent events.
104 |   subscribe("/events");
105 | }
106 | 
107 | init();
108 | 


--------------------------------------------------------------------------------
/bin/dittomancer/static/style.css:
--------------------------------------------------------------------------------
  1 | :root {
  2 |   --bg-dark: #242423;
  3 |   --bg-light: #333533;
  4 |   --fg-light: #e8eddf;
  5 |   --callout: rgb(255, 255, 102);
  6 |   --callout-dark: #101010;
  7 | }
  8 | 
  9 | * {
 10 |   font-size: 14px;
 11 | }
 12 | 
 13 | html,
 14 | body,
 15 | main {
 16 |   background-color: var(--bg-dark);
 17 |   color: #fff;
 18 |   font-family: "Inter", Arial, Helvetica, sans-serif, "Noto Color Emoji";
 19 |   font-weight: 400;
 20 |   text-shadow: rgb(77, 81, 86) 0px 0px 0px;
 21 |   height: 100%;
 22 | }
 23 | 
 24 | main {
 25 |   display: flex;
 26 | }
 27 | 
 28 | button:hover:not(.active) {
 29 |   filter: brightness(1.15);
 30 |   cursor: pointer;
 31 | }
 32 | 
 33 | #sidebar {
 34 |   flex: 3 30%;
 35 |   display: flex;
 36 |   flex-direction: column;
 37 |   overflow: auto;
 38 |   background-color: var(--bg-light);
 39 | }
 40 | 
 41 | #room-list {
 42 |   display: flex;
 43 |   flex-direction: column;
 44 |   overflow: auto;
 45 |   flex: 1;
 46 | }
 47 | 
 48 | #sidebar button {
 49 |   height: 40px;
 50 |   margin-bottom: 1px;
 51 |   background: var(--bg-light);
 52 |   color: #fff;
 53 |   overflow: hidden;
 54 | }
 55 | 
 56 | #sidebar button.active {
 57 |   background: var(--bg-dark);
 58 |   color: var(--callout);
 59 |   font-weight: bold;
 60 |   box-shadow: 0px 2px 2px rgba(0, 0, 0, 0.9);
 61 |   z-index: 10;
 62 | }
 63 | 
 64 | #content {
 65 |   flex: 7 100%;
 66 |   overflow: auto;
 67 |   display: flex;
 68 |   flex-direction: column;
 69 | }
 70 | 
 71 | .message {
 72 |   display: flex;
 73 |   flex-direction: column;
 74 |   padding: 10px 0;
 75 | }
 76 | 
 77 | .message:last-child {
 78 |   padding-bottom: 20px;
 79 | }
 80 | 
 81 | .message .role { /* the template's speaker label; `.username` matched nothing */
 82 |   font-weight: bold;
 83 |   padding-bottom: 5px;
 84 |   color: var(--callout); /* fallback; script.js sets a per-role color inline */
 85 | }
 86 | 
 87 | #messages {
 88 |   padding: 10px 20px;
 89 |   flex: 1;
 90 | }
 91 | 
 92 | form#new-message {
 93 |   bottom: 0;
 94 |   position: sticky;
 95 |   flex: 0 0 auto;
 96 |   width: 100%;
 97 | }
 98 | 
 99 | form {
100 |   display: flex;
101 |   border-top: 2px solid #242424;
102 | }
103 | 
104 | form * {
105 |   height: 40px;
106 |   background: var(--fg-light);
107 |   color: var(--bg-dark);
108 | }
109 | 
110 | input {
111 |   padding: 0 10px;
112 | }
113 | 
114 | input:focus {
115 |   outline: 0;
116 |   filter: brightness(1.05);
117 | }
118 | 
119 | input#username {
120 |   text-align: right;
121 |   flex: 1 25%;
122 |   width: 25%;
123 |   border-right: 1px solid #303030;
124 | }
125 | 
126 | input#message {
127 |   flex: 10 100%;
128 | }
129 | 
130 | form button {
131 |   padding: 0 10px;
132 | }
133 | 
134 | #sidebar #new-room {
135 |   display: flex;
136 |   flex: 0 0 auto;
137 |   flex-direction: row;
138 | }
139 | 
140 | #new-room input:focus,
141 | #new-room button:hover {
142 |   filter: brightness(1.2);
143 | }
144 | 
145 | #new-room input {
146 |   flex: 8 80%;
147 |   width: 20%;
148 |   background-color: var(--callout-dark);
149 |   color: #fff;
150 | }
151 | 
152 | #new-room button {
153 |   flex: 2 20%;
154 |   width: 20%;
155 |   background-color: var(--bg-dark);
156 | }
157 | 
158 | #status {
159 |   padding: 5px 10px;
160 |   text-align: center;
161 |   font-size: 12px;
162 | }
163 | 
164 | #status.pending::before {
165 |   content: "status: pending"; /* initial class, before the stream opens or fails */
166 | }
167 | 
168 | #status.pending {
169 |   background-color: yellow;
170 |   color: #000;
171 | }
172 | 
173 | #status.connected::before {
174 |   content: "status: connected";
175 | }
176 | 
177 | #status.connected {
178 |   background-color: green;
179 |   color: #fff;
180 | }
181 | 
182 | #status.reconnecting::before {
183 |   content: "status: reconnecting";
184 | }
185 | 
186 | #status.reconnecting {
187 |   background-color: red;
188 |   color: #fff;
189 | }
190 | 


--------------------------------------------------------------------------------
/bin/regurgitater/README.md:
--------------------------------------------------------------------------------
 1 | # `regurgitater`
 2 | 
 3 | Is a tool to get language models to regurgitate memorized content. Generally, such memorization is a mistake, as in an "oops we trained on your data without paying you and it's legal nya nya nya" kind of mistake that happens all too frequently in the "AI" industry.
 4 | 
 5 | The tool works by, for a given text, submitting the beginning of the text as context and comparing the generated completion to ground truth. Greedy sampling is used so this generation is deterministic. In other words, you will not have to repeat the process 10,000 times to get the results you're after.
 6 | 
 7 | ## Usage
 8 | 
 9 | ```bash
10 | $ cargo run --features="webchat cli stats" --bin regurgitater -- --model models/model.gguf
11 | ```
12 | 
13 | ## FAQ
14 | 
15 | - **What is greedy sampling?** When you submit some tokens to a language model, you get back a probability distribution of all possible tokens for the one next token. Greedy sampling always picks the most likely token from this list (as opposed to, for example, throwing some digital dice and choosing from the top k most probable tokens).
16 | - **Are you aware the name is spelled wrong?** Yes. It's funny because tater ha ha.
17 | - **Did you paint the vomiting llama?** No. That was Bing Copilot and Dall-E 3.
18 | 
19 | ## Known Issues
20 | 
21 | - When using LLaMA 3, `--vocab unsafe` should be passed as a command line
22 |   option; however, keep in mind that there is then no output sanitization and
23 |   no vocabulary restriction.
24 | 


--------------------------------------------------------------------------------
/bin/regurgitater/regurgitater.rs:
--------------------------------------------------------------------------------
  1 | // Copyright 2004 Michael de Gans
  2 | //
  3 | // Thanks, Copilot, for the completions!
  4 | //
  5 | // I say that to bother people, and because I'm a bit of a troll. Copilot
  6 | // completed that, and it's very true. I'm not sure if it's a good thing or a
  7 | // bad thing.
  8 | //
  9 | //! Detect copyright infringement in llama.cpp supported models. Greedy sampling
 10 | //! always chooses the most likely next token. In cases where the model has
 11 | //! memorized sequences of text, this will result in the model generating the
 12 | //! same text as the original. This usually indicates overfitting, and is a sign
 13 | //! that the deduplication process should be revisited.
 14 | use clap::Parser;
 15 | use rocket::{
 16 |     form::Form,
 17 |     get,
 18 |     http::Status,
 19 |     post,
 20 |     response::stream::{Event, EventStream},
 21 |     serde::{Deserialize, Serialize},
 22 |     tokio::{
 23 |         select,
 24 |         sync::{
 25 |             broadcast::{self, error::RecvError},
 26 |             mpsc,
 27 |         },
 28 |     },
 29 |     FromForm, FromFormField, Shutdown, State,
 30 | };
 31 | 
 32 | use stringmetrics::jaccard;
 33 | 
 34 | use drama_llama::{cli::Args, Engine, PredictOptions, Predicted, VocabKind};
 35 | 
 36 | /// How a generated completion is compared against the ground truth text.
 37 | #[derive(Debug, Clone, FromFormField, Serialize, Deserialize)]
 38 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))]
 39 | #[serde(crate = "rocket::serde", rename_all = "snake_case")]
 40 | pub enum ComparisonMode {
 41 |     Jaccard,
 42 |     // TODO: Paragraph mode. Same as Jaccard similarity, except that the correct
 43 |     // first token of each paragraph is supplied as a hint.
 44 | }
 45 | 
 46 | #[derive(Debug, Clone, FromForm, Serialize, Deserialize)]
 47 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))]
 48 | #[serde(crate = "rocket::serde")]
 49 | pub struct Request {
 50 |     #[field(validate = len(1..1000000))]
 51 |     pub text: String,
 52 |     #[field(default = ComparisonMode::Jaccard)]
 53 |     pub mode: ComparisonMode,
 54 |     /// Number of chunks to split the text into: 1 through 10, inclusive.
 55 |     #[field(validate = range(1..=10), default = 5)]
 56 |     pub chunks: usize,
 57 | }
 58 | 
 59 | #[derive(Debug, Clone, FromForm, Serialize, Deserialize)]
 60 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))]
 61 | #[serde(crate = "rocket::serde")]
 62 | pub struct Response {
 63 |     pub kind: ResponseKind,
 64 |     pub content: String,
 65 | }
 66 | 
 67 | /// Says what the `content` string of a [`Response`] holds.
 68 | #[derive(Debug, Clone, FromFormField, Serialize, Deserialize)]
 69 | #[cfg_attr(test, derive(PartialEq, rocket::UriDisplayQuery))]
 70 | #[serde(crate = "rocket::serde", rename_all = "snake_case")]
 71 | // Unfortunately, FromFormField does not support variants with fields.
 72 | pub enum ResponseKind {
 73 |     /// The prefix of the text that the completion continues from.
 74 |     Prefix,
 75 |     /// One generated piece of the completion.
 76 |     Piece,
 77 |     /// Comparison score over single tokens (unigrams).
 78 |     TokenUnigramScore,
 79 |     /// Comparison score over pairs of adjacent tokens (bigrams).
 80 |     TokenBigramScore,
 81 |     /// Comparison score over characters.
 82 |     CharacterScore,
 83 |     /// Comparison score over unigrams.
 84 |     UnigramScore,
 85 |     /// Comparison score over bigrams.
 86 |     BigramScore,
 87 |     /// Percent of tokens that will be supplied as prefix.
 88 |     PercentOfTokens,
 89 |     /// Progress update.
 90 |     Progress,
 91 |     /// Recoverable error message.
 92 |     Error,
 93 |     /// Fatal error message. Triggers shutdown.
 94 |     Fatal,
 95 |     /// The engine is busy.
 96 |     Busy,
 97 |     /// The engine is ready.
 98 |     Ready,
 99 |     /// The engine has shut down.
100 |     Shutdown,
101 | }
102 | 
103 | /// Streams the worker's broadcast responses to the client as server-sent events
104 | /// until the channel closes, Rocket shuts down, or a `Fatal` response is sent.
105 | #[get("/events")]
106 | pub async fn events(
107 |     to_client: &State<broadcast::Sender<Response>>,
108 |     mut end: Shutdown,
109 | ) -> EventStream![] {
110 |     let mut rx = to_client.subscribe();
111 | 
112 |     EventStream! {
113 |         loop {
114 |             let res = select! {
115 |                 msg = rx.recv() => match msg {
116 |                     Ok(msg) => msg,
117 |                     Err(RecvError::Closed) => break,
118 |                     // TODO: handle lagged messages.
119 |                     Err(RecvError::Lagged(_n_messages)) => continue,
120 |                 },
121 |                 _ = &mut end => {
122 |                     // FIXME: Engine doesn't shutdown until the completion of
123 |                     // the generation. This is not allowed because of a lifetime
124 |                     // issue. :/
125 |                     // to_engine_shutdown.send(dbg!(())).await.ok();
126 |                     break
127 |                 },
128 |             };
129 | 
130 |             yield Event::json(&res);
131 | 
132 |             if matches!(res.kind, ResponseKind::Fatal) {
133 |                 // If the engine is dead, we should stop sending events. The
134 |                 // client will have been notified of the error, but this is
135 |                 // unrecoverable.
136 |                 end.notify();
137 |                 break;
138 |             }
139 |         }
140 |     }
141 | }
142 | 
143 | /// Queues a regurgitation request for the worker. Responds `202 Accepted` once
144 | /// queued, or `503 Service Unavailable` if the worker is gone or shutting down.
145 | #[post("/request", data = "<form>")]
146 | pub async fn request(
147 |     form: Form<Request>,
148 |     to_engine: &State<mpsc::Sender<Request>>,
149 |     mut end: Shutdown,
150 | ) -> Status {
151 |     let queued = select! {
152 |         // A failed send means the engine is (probably) dead.
153 |         sent = to_engine.send(form.into_inner()) => sent.is_ok(),
154 |         // Shutdown was signalled before the request could be queued.
155 |         _ = &mut end => false,
156 |     };
157 |     if queued {
158 |         Status::Accepted
159 |     } else {
160 |         Status::ServiceUnavailable
161 |     }
162 | }
163 | 
164 | /// Serves the terms of use as HTML (a bare `String` would be sent as text/plain).
165 | #[get("/tos")]
166 | pub async fn tos() -> rocket::response::content::RawHtml<String> {
167 |     rocket::response::content::RawHtml(markdown::to_html(drama_llama::TOS))
168 | }
168 | 
169 | /// Loads the model on a blocking worker thread, then serves the web UI. The
170 | /// worker takes requests from the `/request` route over an mpsc channel and
171 | /// broadcasts its events to every `/events` subscriber.
172 | #[rocket::main]
173 | async fn main() {
174 |     use drama_llama::SampleOptions;
175 |     use llama_cpp_sys_3::llama_token;
176 |     use rocket::{
177 |         fs::{relative, FileServer},
178 |         routes,
179 |         tokio::sync::{broadcast, mpsc},
180 |     };
181 | 
182 |     let args = Args::parse();
183 | 
184 |     // Our worker thread receives inference requests from the client and sends
185 |     // the generated completions and scores back to the client.
186 |     let (to_engine, mut from_client) = mpsc::channel::<Request>(1024);
187 |     let (to_client, _) = broadcast::channel::<Response>(1024);
188 |     let to_client_clone = to_client.clone();
189 |     let worker = rocket::tokio::task::spawn_blocking(move || {
190 |         // Broadcasts one event to every connected client. A send fails only
191 |         // when nobody is subscribed, so the result is deliberately ignored.
192 |         let send = |kind: ResponseKind, content: String| {
193 |             to_client.send(Response { kind, content }).ok();
194 |         };
195 | 
196 |         let mut engine = match Engine::from_cli(args, None) {
197 |             Ok(engine) => engine,
198 |             Err(e) => {
199 |                 send(
200 |                     ResponseKind::Fatal,
201 |                     format!("Failed to load engine because: {}", e),
202 |                 );
203 |                 return;
204 |             }
205 |         };
206 | 
207 |         // This is a temporary measure because forbidding some tokens can break
208 |         // regurgitation in some cases. This is a known issue and will be fixed.
209 |         engine.set_vocab(VocabKind::Unsafe);
210 | 
211 |         let mut opts = PredictOptions::default();
212 |         opts.sample_options = SampleOptions::greedy();
213 | 
214 |         let ready =
215 |             || send(ResponseKind::Ready, "Engine is ready.".to_string());
216 | 
217 |         ready();
218 | 
219 |         // Sends token update scores to the client. This happens for each token.
220 |         let update_token_similarity =
221 |             |ground_truth: &[llama_token], completion: &[llama_token]| {
222 |                 send(
223 |                     ResponseKind::TokenUnigramScore,
224 |                     format!(
225 |                         "{:.4}",
226 |                         jaccard(ground_truth.iter(), completion.iter())
227 |                     ),
228 |                 );
229 | 
230 |                 // The bigram score is NaN when there is nothing to compare
231 |                 // (presumably for sequences of one token); skip it then.
232 |                 let bigram_score =
233 |                     jaccard(ground_truth.windows(2), completion.windows(2));
234 |                 if !bigram_score.is_nan() {
235 |                     send(
236 |                         ResponseKind::TokenBigramScore,
237 |                         format!("{:.4}", bigram_score),
238 |                     );
239 |                 }
240 |             };
241 | 
242 |         // Sends string update scores to the client. This happens for each chunk.
243 |         let update_string_similarity =
244 |             |ground_truth: String, completion: String| {
245 |                 send(
246 |                     ResponseKind::CharacterScore,
247 |                     format!(
248 |                         "{:.4}",
249 |                         jaccard(ground_truth.chars(), completion.chars())
250 |                     ),
251 |                 );
252 | 
253 |                 // Word-level scores: split both texts on whitespace.
254 |                 let ground_truth: Vec<_> =
255 |                     ground_truth.split_whitespace().collect();
256 |                 let completion: Vec<_> =
257 |                     completion.split_whitespace().collect();
258 | 
259 |                 send(
260 |                     ResponseKind::UnigramScore,
261 |                     format!(
262 |                         "{:.4}",
263 |                         jaccard(ground_truth.iter(), completion.iter())
264 |                     ),
265 |                 );
266 |                 send(
267 |                     ResponseKind::BigramScore,
268 |                     format!(
269 |                         "{:.4}",
270 |                         jaccard(ground_truth.windows(2), completion.windows(2))
271 |                     ),
272 |                 );
273 |             };
274 | 
275 |         'outer: while let Some(request) = from_client.blocking_recv() {
276 |             let tokens = engine.model.tokenize(&request.text, false);
277 |             let chunk_size = tokens.len() / request.chunks;
278 | 
279 |             // With fewer tokens than chunks every prefix would be empty, and
280 |             // with no tokens at all the percentage below would divide by
281 |             // zero. Report the problem once instead.
282 |             if chunk_size == 0 {
283 |                 send(
284 |                     ResponseKind::Error,
285 |                     format!(
286 |                         "Text is too short to split into {} chunks.",
287 |                         request.chunks
288 |                     ),
289 |                 );
290 |                 ready();
291 |                 continue;
292 |             }
293 | 
294 |             for i in 1..request.chunks {
295 |                 // Split the text into successively larger chunks.
296 |                 let (chunk, ground_truth) = tokens.split_at(chunk_size * i);
297 |                 send(
298 |                     ResponseKind::PercentOfTokens,
299 |                     format!("{}%", i * chunk_size * 100 / tokens.len()),
300 |                 );
301 |                 // TODO: we don't need this event, probably
302 |                 send(ResponseKind::Busy, "Engine is busy.".to_string());
303 |                 send(
304 |                     ResponseKind::Prefix,
305 |                     engine.model.tokens_to_string(chunk.iter().cloned()),
306 |                 );
307 | 
308 |                 // Whatever the prefix leaves of the context is the budget for
309 |                 // the completion. If the subtraction underflows or the budget
310 |                 // does not convert, longer prefixes cannot fit either, so
311 |                 // report the problem and give up on the remaining chunks.
312 |                 let room = (engine.n_ctx() as usize).checked_sub(chunk.len());
313 |                 opts.n = match room.and_then(|n| n.try_into().ok()) {
314 |                     Some(n) => n,
315 |                     None => {
316 |                         send(
317 |                             ResponseKind::Error,
318 |                             format!(
319 |                                 "Prefix of {} tokens exceeds the context.",
320 |                                 chunk.len()
321 |                             ),
322 |                         );
323 |                         break;
324 |                     }
325 |                 };
326 | 
327 |                 let mut completion = Vec::with_capacity(ground_truth.len());
328 |                 for Predicted { token, piece } in
329 |                     engine.predict(chunk.to_vec(), opts.clone())
330 |                 {
331 |                     if from_client.is_closed() {
332 |                         break 'outer;
333 |                     }
334 | 
335 |                     send(ResponseKind::Piece, piece);
336 |                     completion.push(token);
337 | 
338 |                     // We only compare sequences of equal length, until the
339 |                     // completion is the same length as the ground truth.
340 |                     update_token_similarity(
341 |                         &ground_truth[..completion.len()],
342 |                         &completion,
343 |                     );
344 | 
345 |                     send(
346 |                         ResponseKind::Progress,
347 |                         format!(
348 |                             "{}.0%",
349 |                             completion.len() * 100 / ground_truth.len()
350 |                         ),
351 |                     );
352 |                     if completion.len() == ground_truth.len() {
353 |                         break;
354 |                     }
355 |                 }
356 | 
357 |                 update_string_similarity(
358 |                     engine.model.tokens_to_string(ground_truth.iter().cloned()),
359 |                     engine.model.tokens_to_string(completion.iter().cloned()),
360 |                 );
361 |             }
362 | 
363 |             ready();
364 |         }
365 | 
366 |         send(
367 |             ResponseKind::Shutdown,
368 |             "Inference engine has shut down.".to_string(),
369 |         );
370 |     });
371 | 
372 |     let rocket = rocket::build()
373 |         .manage(to_engine)
374 |         .manage(to_client_clone)
375 |         .mount("/", routes![request, events, tos])
376 |         .mount("/", FileServer::from(relative!("bin/regurgitater/static")))
377 |         .ignite()
378 |         .await
379 |         .unwrap()
380 |         .launch()
381 |         .await
382 |         .unwrap();
383 | 
384 |     // We need to manually drop the rocket before joining the thread or the
385 |     // sender will never be dropped and the worker will never finish.
386 |     drop(rocket);
387 |     worker.await.unwrap();
388 | }
421 | 


--------------------------------------------------------------------------------
/bin/regurgitater/static/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <!-- Many thanks to Bing Copilot for help with this html. I am not a frontend
 4 | developer, so the help is very much appreciated. -->
 5 | 
 6 |   <head>
 7 |     <meta charset="utf-8" />
 8 |     <title>Regurgitater</title>
 9 |     <link rel="stylesheet" href="/reset.css" />
10 |     <link rel="stylesheet" href="/style.css" />
11 |     <script src="/script.js" charset="utf-8" defer></script>
12 |   </head>
13 | 
14 |   <body>
15 |     <main>
16 |       <div id="sidebar">
17 |         <img src="/regurgitater.png" alt="Regurgitater logo" />
18 |         <div id="status" class="pending"></div>
19 |         <!-- Inference progress bar -->
20 |         <div id="progress" class="progress">
21 |           <div id="progress_bar" class="progress_bar"></div>
22 |         </div>
23 |         <!-- Similarity scores -->
24 |         <div id="scorebox">
25 |           <template id="scores">
26 |             <div class="scores">
27 |               <div id="percent_of_tokens" class="percent_of_tokens">
28 |                 <span>Percent of tokens:</span>
29 |                 <span id="percent_of_tokens_score">0%</span>
30 |               </div>
31 |               <div id="token_unigram" class="score">
32 |                 <span>Unigram (tokens):</span>
33 |                 <span id="token_unigram_score">0%</span>
34 |               </div>
35 |               <div id="token_bigram" class="score">
36 |                 <span>Bigram (tokens):</span>
37 |                 <span id="token_bigram_score">0%</span>
38 |               </div>
39 |               <div id="unigram" class="score">
40 |                 <span>Unigram (words):</span>
41 |                 <span id="unigram_score">0%</span>
42 |               </div>
43 |               <div id="bigram" class="score">
44 |                 <span>Bigram (words):</span>
45 |                 <span id="bigram_score">0%</span>
46 |               </div>
47 |               <div id="character" class="score">
48 |                 <span>Character:</span>
49 |                 <span id="character_score">0%</span>
50 |               </div>
51 |             </div>
52 |           </template>
53 |         </div>
54 |         <!-- A button to save the score to xml -->
55 |         <button id="save">Save</button>
56 |       </div>
57 | 
58 |       <div id="content">
59 |         <div id="generation">
60 |           <template id="piece"><span class="piece"></span></template>
61 |         </div>
62 | 
63 |         <form id="request">
64 |           <!-- Dropdown for number of chunks between 2 and 10 -->
65 |           <select name="request_chunks" id="request_chunks">
66 |             <option value="2">2</option>
67 |             <option value="3">3</option>
68 |             <option value="4">4</option>
69 |             <option value="5" selected>5</option>
70 |             <option value="6">6</option>
71 |             <option value="7">7</option>
72 |             <option value="8">8</option>
73 |             <option value="9">9</option>
74 |             <option value="10">10</option>
75 |           </select>
76 |           <!-- Text box for infringing text -->
77 |           <input
78 |             type="text"
79 |             name="request_text"
80 |             id="request_text"
81 |             autocomplete="off"
82 |             placeholder="Four score and seven years ago..."
83 |             autofocus
84 |           />
85 |           <button type="submit" id="send">Send</button>
86 |         </form>
87 |       </div>
88 |     </main>
89 |   </body>
90 | </html>
91 | 


--------------------------------------------------------------------------------
/bin/regurgitater/static/regurgitater.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdegans/drama_llama/1b7e460500342b8102b57167cd28043c83bd6ac4/bin/regurgitater/static/regurgitater.png


--------------------------------------------------------------------------------
/bin/regurgitater/static/reset.css:
--------------------------------------------------------------------------------
 1 | html,
 2 | body,
 3 | p,
 4 | ol,
 5 | ul,
 6 | li,
 7 | dl,
 8 | dt,
 9 | dd,
10 | blockquote,
11 | figure,
12 | fieldset,
13 | legend,
14 | textarea,
15 | pre,
16 | iframe,
17 | hr,
18 | h1,
19 | h2,
20 | h3,
21 | h4,
22 | h5,
23 | h6 {
24 |   margin: 0;
25 |   padding: 0;
26 | }
27 | 
28 | h1,
29 | h2,
30 | h3,
31 | h4,
32 | h5,
33 | h6 {
34 |   font-size: 100%;
35 |   font-weight: normal;
36 | }
37 | 
38 | ul {
39 |   list-style: none;
40 | }
41 | 
42 | button,
43 | input,
44 | select {
45 |   margin: 0;
46 | }
47 | 
48 | html {
49 |   box-sizing: border-box;
50 | }
51 | 
52 | *,
53 | *::before,
54 | *::after {
55 |   box-sizing: inherit;
56 | }
57 | 
58 | img,
59 | video {
60 |   height: auto;
61 |   max-width: 100%;
62 | }
63 | 
64 | iframe,
65 | button,
66 | input {
67 |   border: 0;
68 | }
69 | 
70 | table {
71 |   border-collapse: collapse;
72 |   border-spacing: 0;
73 | }
74 | 
75 | td,
76 | th {
77 |   padding: 0;
78 | }
79 | 


--------------------------------------------------------------------------------
/bin/regurgitater/static/script.js:
--------------------------------------------------------------------------------
  1 | // This example is from the Rocket chat example. It's been modified into a
  2 | // frontend for the regurgitater example.
  3 | //
  4 | // Many thanks to Bing's Copilot for helping me with this code. I'm not a
  5 | // frontend developer, so I'm not very good at this stuff. Many bugs were
  6 | // squashed with their help.
  7 | 
  8 | let generationDiv = document.getElementById("generation");
  9 | let newRequestForm = document.getElementById("request");
 10 | let statusDiv = document.getElementById("status");
 11 | let pieceTemplate = document.getElementById("piece");
 12 | let inputTextField = document.getElementById("request_text");
 13 | let chunkDropdown = document.getElementById("request_chunks");
 14 | let progressBar = document.getElementById("progress_bar");
 15 | let scoresTemplate = document.getElementById("scores");
 16 | let scorebox = document.getElementById("scorebox");
 17 | 
 18 | // State to store status.
 19 | let STATE = {
 20 |   status: "pending",
 21 | };
 22 | 
 23 | // Set the connection status. The status is a string corresponding to a CSS
 24 | // class name and the response kind.
 25 | function setStatus(status) {
 26 |   STATE.status = status;
 27 |   statusDiv.className = status;
 28 | }
 29 | 
 30 | // Generate a color from a "hash" of a string. Thanks, internet.
 31 | function hashColor(str) {
 32 |   let hash = 0;
 33 |   for (var i = 0; i < str.length; i++) {
 34 |     hash = str.charCodeAt(i) + ((hash << 5) - hash);
 35 |     hash = hash & hash;
 36 |   }
 37 | 
 38 |   return `hsl(${hash % 360}, 100%, 70%)`;
 39 | }
 40 | 
 41 | // A function calculating color for a piece. 0.0 is green, 1.0 is red.
 42 | function scoreColor(score) {
 43 |   return `hsl(${score * 120}, 100%, 70%)`;
 44 | }
 45 | 
 46 | // Add a piece to the generation.
 47 | function addPiece(piece) {
 48 |   var spanClone = pieceTemplate.content.cloneNode(true);
 49 |   spanClone.querySelector(".piece").textContent = piece;
 50 |   generationDiv.appendChild(spanClone);
 51 | }
 52 | 
 53 | // Color the last piece with a score.
 54 | function colorLastPiece(percent) {
 55 |   var lastPiece = generationDiv.lastElementChild;
 56 |   lastPiece.style.backgroundColor = scoreColor(percent);
 57 | }
 58 | 
 59 | // Get the lastElementChild of the scorebox div. If there are no children, add
 60 | // a new scores div.
 61 | function getScores() {
 62 |   if (scorebox.children.length == 0) {
 63 |     let scores = scoresTemplate.content.cloneNode(true);
 64 |     scorebox.appendChild(scores);
 65 |   }
 66 | 
 67 |   return scorebox.lastElementChild;
 68 | }
 69 | 
 70 | // Clear everything.
 71 | function clear() {
 72 |   generationDiv.innerHTML = "";
 73 |   scorebox.innerHTML = "";
 74 |   progressBar.style.width = "0.0%";
 75 |   progressBar.style.textContent = "";
 76 | }
 77 | 
 78 | // Disable input fields.
 79 | function disableInput(disabled) {
 80 |   inputTextField.disabled = disabled;
 81 |   chunkDropdown.disabled = disabled;
 82 | }
 83 | 
 84 | // Subscribe to the event source at `uri` with exponential backoff reconnect.
 85 | function subscribe(uri) {
 86 |   var retryTime = 1;
 87 | 
 88 |   function connect(uri) {
 89 |     const events = new EventSource(uri);
 90 | 
 91 |     events.addEventListener("message", (ev) => {
 92 |       console.log("raw data", JSON.stringify(ev.data));
 93 |       console.log("decoded data", JSON.stringify(JSON.parse(ev.data)));
 94 |       const res = JSON.parse(ev.data);
 95 |       if (!("content" in res) || !("kind" in res)) return;
 96 | 
 97 |       switch (res.kind) {
 98 |         case "piece":
 99 |           addPiece(res.content);
100 |           break;
101 |         case "token_unigram_score":
102 |           colorLastPiece(res.content);
103 |           // id is `token_unigram_score`
104 |           let tokenUnigramScore = getScores().querySelector(
105 |             "#token_unigram_score"
106 |           );
107 |           tokenUnigramScore.textContent = res.content;
108 |           break;
109 |         case "token_bigram_score":
110 |           let tokenBigramScore = getScores().querySelector(
111 |             "#token_bigram_score"
112 |           );
113 |           tokenBigramScore.textContent = res.content;
114 |           break;
115 |         case "progress":
116 |           progressBar.style.width = res.content;
117 |           progressBar.textContent = res.content;
118 |           break;
119 |         case "percent_of_tokens":
120 |           progressBar.style.width = "0.0%";
121 |           progressBar.style.textContent = "";
122 |           if (generationDiv.children.length != 0) {
123 |             addPiece("\n\n\n");
124 |           }
125 |           // TODO: clean this up
126 |           addPiece("percent of tokens: " + res.content + "\n\n\n");
127 |           var lastPiece = generationDiv.lastElementChild;
128 |           lastPiece.style.color = "var(--text-color)";
129 |           let scores = scoresTemplate.content.cloneNode(true);
130 |           let percentOfTokensScore = scores.querySelector(
131 |             "#percent_of_tokens_score"
132 |           );
133 |           percentOfTokensScore.textContent = res.content;
134 |           scorebox.appendChild(scores);
135 |           break;
136 |         case "character_score":
137 |           let characterScore = getScores().querySelector("#character_score");
138 |           characterScore.textContent = res.content;
139 |           break;
140 |         case "unigram_score":
141 |           let unigramScore = getScores().querySelector("#unigram_score");
142 |           unigramScore.textContent = res.content;
143 |           break;
144 |         case "bigram_score":
145 |           let bigramScore = getScores().querySelector("#bigram_score");
146 |           bigramScore.textContent = res.content;
147 |           break;
148 |         case "prefix":
149 |           addPiece(res.content);
150 |           var lastPiece = generationDiv.lastElementChild;
151 |           lastPiece.style.backgroundColor = "blue";
152 |           break;
153 |         case "ready":
154 |           disableInput(false);
155 |           setStatus(res.kind);
156 |           console.log(res.content);
157 |           break;
158 |         case "error":
159 |         case "fatal":
160 |         case "busy":
161 |         case "shutdown":
162 |           disableInput(true);
163 |           setStatus(res.kind);
164 |           console.log(res.content);
165 |           break;
166 |         default:
167 |           console.error(
168 |             `unknown response kind: ${res.kind} with content: ${res.content}`
169 |           );
170 |       }
171 |     });
172 | 
173 |     events.addEventListener("open", () => {
174 |       setStatus("connected");
175 |       // TODO: On reconnect we should check the status, but our API is very
176 |       // simple for this example code and doesn't support this yet, nor is
177 |       // authentication implemented. We don't even have sessions.
178 |       clear();
179 |       console.log(`connected to event stream at ${uri}`);
180 |       retryTime = 1;
181 |     });
182 | 
183 |     events.addEventListener("error", () => {
184 |       setStatus("disconnected");
185 |       events.close();
186 | 
187 |       let timeout = retryTime;
188 |       retryTime = Math.min(64, retryTime * 2);
189 |       console.log(`connection lost. attempting to reconnect in ${timeout}s`);
190 |       setTimeout(() => connect(uri), (() => timeout * 1000)());
191 |     });
192 |   }
193 | 
194 |   connect(uri);
195 | }
196 | 
// Let's go! Initialize the world: wire up the request form and subscribe to
// the server-sent event stream.
function init() {
  // Set up the form handler.
  newRequestForm.addEventListener("submit", (e) => {
    e.preventDefault();

    // Requests are only sent while the status is "connected" or "ready".
    if (STATE.status !== "connected" && STATE.status !== "ready") return;

    const text = inputTextField.value;
    const mode = "jaccard";
    const chunks = chunkDropdown.value;

    fetch("/request", {
      method: "POST",
      body: new URLSearchParams({ mode, text, chunks }),
    })
      .then((response) => {
        if (response.ok) {
          inputTextField.value = "";
        } else {
          // Keep the text so the user can retry, but say why nothing happened.
          console.error(
            `request rejected: ${response.status} ${response.statusText}`
          );
        }
      })
      // Network failures reject the promise; report them instead of leaving
      // an unhandled rejection.
      .catch((err) => console.error("request failed:", err));
  });

  // Subscribe to server-sent events.
  subscribe("/events");
}

init();
222 | 


--------------------------------------------------------------------------------
/bin/regurgitater/static/style.css:
--------------------------------------------------------------------------------
  1 | :root {
  2 |   --bg-dark: #242423;
  3 |   --bg-light: #333533;
  4 |   --fg-light: #e8eddf;
  5 |   --callout: rgb(255, 255, 102);
  6 |   --callout-dark: #101010;
  7 | }
  8 | 
  9 | * {
 10 |   font-size: 14px;
 11 | }
 12 | 
 13 | html,
 14 | body,
 15 | main {
 16 |   background-color: var(--bg-dark);
 17 |   color: #fff;
 18 |   font-family: "Inter", Arial, Helvetica, sans-serif, "Noto Color Emoji";
 19 |   font-weight: 400;
 20 |   text-shadow: rgb(77, 81, 86) 0px 0px 0px;
 21 |   height: 100%;
 22 | }
 23 | 
 24 | main {
 25 |   display: flex;
 26 | }
 27 | 
 28 | button:hover:not(.active) {
 29 |   filter: brightness(1.15);
 30 |   cursor: pointer;
 31 | }
 32 | 
 33 | #sidebar {
 34 |   flex: 3 30%;
 35 |   display: flex;
 36 |   flex-direction: column;
 37 |   overflow: auto;
 38 |   background-color: var(--bg-light);
 39 | }
 40 | 
 41 | #sidebar button {
 42 |   height: 40px;
 43 |   margin-bottom: 1px;
 44 |   background: var(--bg-light);
 45 |   color: #fff;
 46 |   overflow: hidden;
 47 | }
 48 | 
 49 | #sidebar button.active {
 50 |   background: var(--bg-dark);
 51 |   color: var(--callout);
 52 |   font-weight: bold;
 53 |   box-shadow: 0px 2px 2px rgba(0, 0, 0, 0.9);
 54 |   z-index: 10;
 55 | }
 56 | 
 57 | #content {
 58 |   flex: 7 100%;
 59 |   overflow: auto;
 60 |   display: flex;
 61 |   flex-direction: column;
 62 | }
 63 | 
 64 | .piece {
 65 |   white-space: pre-wrap;
 66 |   color: var(--bg-dark);
 67 | }
 68 | 
 69 | #generation {
 70 |   padding: 10px 20px;
 71 |   flex: 1;
 72 | }
 73 | 
 74 | .scores {
 75 |   padding: 5px 5px;
 76 |   flex: 1;
 77 | }
 78 | 
 79 | form#request {
 80 |   bottom: 0;
 81 |   position: sticky;
 82 |   flex: 0 0 auto;
 83 |   width: 100%;
 84 | }
 85 | 
 86 | form {
 87 |   display: flex;
 88 |   border-top: 2px solid #242424;
 89 | }
 90 | 
 91 | form * {
 92 |   height: 40px;
 93 |   background: var(--fg-light);
 94 |   color: var(--bg-dark);
 95 | }
 96 | 
 97 | input {
 98 |   padding: 0 10px;
 99 | }
100 | 
101 | input:focus {
102 |   outline: 0;
103 |   filter: brightness(1.05);
104 | }
105 | 
106 | input#username {
107 |   text-align: right;
108 |   flex: 1 25%;
109 |   width: 25%;
110 |   border-right: 1px solid #303030;
111 | }
112 | 
113 | input#request_text {
114 |   flex: 10 100%;
115 | }
116 | 
117 | form button {
118 |   padding: 0 10px;
119 | }
120 | 
121 | #sidebar #new-room {
122 |   display: flex;
123 |   flex: 0 0 auto;
124 |   flex-direction: row;
125 | }
126 | 
/* ---- Status banner ----
   script.js sets the element's class to the current status string; the
   label is generated from that class and the colours follow it. */

#status {
  padding: 5px 10px;
  text-align: center;
  font-size: 12px;
}

/* Labels. "pending" used to read "status: connected", which made the initial
   state indistinguishable from an open connection. */
#status.pending::before {
  content: "status: pending";
}

#status.connected::before {
  content: "status: connected";
}

#status.disconnected::before {
  content: "status: disconnected";
}

#status.ready::before {
  content: "status: ready";
}

#status.busy::before {
  content: "status: generating...";
}

#status.error::before {
  content: "status: error";
}

#status.reconnecting::before {
  content: "status: reconnecting";
}

#status.shutdown::before {
  content: "status: shutdown";
}

/* Colours. */
#status.pending {
  background-color: yellow;
  color: #000;
}

#status.connected {
  background-color: orange;
  color: #fff;
}

#status.ready {
  background-color: green;
  color: #fff;
}

#status.busy {
  background-color: blue;
  color: #fff;
}

#status.disconnected,
#status.error,
#status.reconnecting,
#status.shutdown {
  background-color: red;
  color: #fff;
}
204 | 
/* Thank you, Bing Copilot for this help here. */

/* Container: spans the full available width. */
.progress {
  width: 100%;
}

/* Bar: starts empty; green fill with centred white text. */
.progress_bar {
  width: 0%;
  color: #ffffff;
  text-align: center;
  background-color: #4caf50;
}
216 | 


--------------------------------------------------------------------------------
/bin/settings_tool/README.md:
--------------------------------------------------------------------------------
 1 | # `settings_tool`
 2 | 
`settings_tool` is a very simple tool for editing `drama_llama` options via a
GUI. It exists mostly to test the `egui` feature, but it may also be useful for
generating configuration files.
 5 | 
 6 | Run it like:
 7 | 
 8 | ```text
 9 | cargo run --bin settings_tool --features="egui,toml,serde,serde_json"
10 | ```
11 | 
12 | ## Notes
13 | 
- TOML cannot store the settings properly because it doesn't support u128, or at
  least the `toml` crate doesn't. The TOML mode is kept anyway because at some
  point we might store the seed differently (for example, as two u64s).
17 | 


--------------------------------------------------------------------------------
/bin/settings_tool/settings_tool.rs:
--------------------------------------------------------------------------------
//! A simple tool to test the settings GUI. It can be used to generate a TOML
//! representation of the settings but is mostly just a testbed for the GUI.
 3 | use drama_llama::PredictOptions;
 4 | use rocket::serde::Serialize;
 5 | 
/// Output format used to render the current settings as text.
#[derive(Clone, Copy, PartialEq, Default)]
enum Mode {
    /// Pretty-printed JSON. This is the default.
    #[default]
    JSON,
    /// Pretty-printed TOML. Only compiled in with the `toml` feature.
    #[cfg(feature = "toml")]
    TOML,
}
13 | 
14 | impl Mode {
15 |     fn render<S>(self, s: &S) -> String
16 |     where
17 |         S: Serialize,
18 |     {
19 |         match self {
20 |             Mode::JSON => match serde_json::to_string_pretty(&s) {
21 |                 Ok(s) => s,
22 |                 Err(e) => format!("Error: {}", e),
23 |             },
24 |             #[cfg(feature = "toml")]
25 |             Mode::TOML => match toml::to_string_pretty(&s) {
26 |                 Ok(s) => s,
27 |                 Err(e) => format!("Error: {}", e),
28 |             },
29 |         }
30 |     }
31 | 
32 |     fn as_str(self) -> &'static str {
33 |         match self {
34 |             Mode::JSON => "JSON",
35 |             #[cfg(feature = "toml")]
36 |             Mode::TOML => "TOML",
37 |         }
38 |     }
39 | }
40 | 
/// State of the settings tool window.
#[derive(Default)]
struct App {
    /// The options edited in the left-hand panel.
    pub options: PredictOptions,
    /// The format used to render `options` in the central panel.
    pub mode: Mode,
}
46 | 
impl eframe::App for App {
    /// Draw one frame: the options editor in a left side panel, and a format
    /// selector plus the rendered options in the central panel.
    fn update(&mut self, ctx: &egui::Context, _frame: &mut eframe::Frame) {
        // Left panel: the options draw their own editing widgets.
        egui::SidePanel::left("settings")
            .default_width(400.0)
            .show(ctx, |ui| self.options.draw(ui));

        egui::CentralPanel::default().show(ctx, |ui| {
            // Format selector. The TOML entry only exists with the `toml`
            // feature.
            egui::ComboBox::from_label("Format")
                .selected_text(self.mode.as_str())
                .show_ui(ui, |ui| {
                    ui.selectable_value(
                        &mut self.mode,
                        Mode::JSON,
                        Mode::JSON.as_str(),
                    );
                    #[cfg(feature = "toml")]
                    ui.selectable_value(
                        &mut self.mode,
                        Mode::TOML,
                        Mode::TOML.as_str(),
                    );
                });

            ui.separator();
            // The options are re-rendered in the selected format every frame.
            ui.label(self.mode.render(&self.options))
        });
    }
}
75 | 
76 | pub fn main() -> Result<(), Box<dyn std::error::Error>> {
77 |     eframe::run_native(
78 |         "`drama_llama` Settings Tool",
79 |         eframe::NativeOptions::default(),
80 |         Box::new(|_| Box::new(App::default())),
81 |     )?;
82 | 
83 |     Ok(())
84 | }
85 | 


--------------------------------------------------------------------------------
/models/README.md:
--------------------------------------------------------------------------------
1 | A model should be copied or linked in this folder with the name `model.gguf` for
2 | testing purposes.
3 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 80


--------------------------------------------------------------------------------
/src/batch.rs:
--------------------------------------------------------------------------------
  1 | use llama_cpp_sys_3::{
  2 |     llama_batch, llama_batch_free, llama_batch_init, llama_seq_id, llama_token,
  3 | };
  4 | use thiserror::Error;
  5 | 
/// A `Batch` of tokens or embeddings. This wraps a [`llama_batch`] and provides
/// safe accessors for its members.
#[derive(Debug)]
pub struct Batch {
    /// The underlying C struct
    pub(crate) batch: llama_batch,
    /// Batch size (maximum number of members in the batch)
    pub(crate) capacity: usize,
    /// The length of each embedding. Zero for a batch of tokens. When
    /// batch.tokens is null:
    ///
    /// ```C
    /// batch.embd = (float *) malloc(sizeof(float) * capacity * embd);
    /// ```
    pub(crate) embd_len: usize,
    /// The maximum number of sequence ids per token.
    pub(crate) n_seq_max: usize,
}
 23 | 
 24 | #[derive(Debug, Error, PartialEq)]
 25 | pub enum AddError {
 26 |     #[error("The batch is full")]
 27 |     Full,
 28 |     #[error("The number of sequence ids does not match the batch's n_seq_max")]
 29 |     InvalidSequenceLength,
 30 |     // FIXME: add `add_embedding` method to `Batch`
 31 |     #[error("A token was supplied, but thet batch was created with embd_len > 0. Call `add_embedding` instead.")]
 32 |     ExpectedEmbedding,
 33 |     #[error("An embedding was supplied, but thet batch was created with embd_len == 0. Call `add_token` instead.")]
 34 |     ExpectedToken,
 35 |     #[error("Invalid token position.")]
 36 |     InvalidPosition,
 37 | }
 38 | 
 39 | static_assertions::assert_impl_all!(AddError: Send, Sync);
 40 | 
 41 | impl Batch {
 42 |     /// Create a new [`Batch`] with the given `capacity` for tokens or
 43 |     /// embeddings. If `embd_len` is zero, the `tokens` accessor will be
 44 |     /// available, otherwise the `embd` accessor will be available. Each token
 45 |     /// can be assigned up to `n_seq_max` sequence ids.
 46 |     pub fn new(
 47 |         capacity: usize,
 48 |         embd_len: usize,
 49 |         n_seq_max: usize,
 50 |     ) -> Option<Self> {
 51 |         let batch = unsafe {
 52 |             llama_batch_init(
 53 |                 capacity.try_into().ok()?,
 54 |                 embd_len.try_into().ok()?,
 55 |                 n_seq_max.try_into().ok()?,
 56 |             )
 57 |         };
 58 | 
 59 |         // sanity
 60 |         debug_assert!(batch.n_tokens == 0);
 61 | 
 62 |         Some(Self {
 63 |             batch,
 64 |             capacity,
 65 |             embd_len,
 66 |             n_seq_max,
 67 |         })
 68 |     }
 69 | 
 70 |     /// Create a new [`Batch`] with capacity for tokens. The the `logit` field
 71 |     /// for all but the last token will be set to `false`. If the capacity is
 72 |     /// less than the number of tokens, the largest value will be used.
 73 |     pub fn from_tokens(
 74 |         capacity: usize,
 75 |         tokens: &[llama_token],
 76 |     ) -> Option<Self> {
 77 |         let mut batch = Self::new(capacity.max(tokens.len()), 0, 1)?;
 78 | 
 79 |         for (i, token) in tokens.iter().enumerate() {
 80 |             let logits = i == tokens.len() - 1;
 81 |             batch.add_token(*token, i, None, logits).ok()?;
 82 |         }
 83 | 
 84 |         Some(batch)
 85 |     }
 86 | 
 87 |     /// The maximum number of members in the batch.
 88 |     pub const fn capacity(&self) -> usize {
 89 |         self.capacity
 90 |     }
 91 | 
 92 |     /// The current number of members in the batch.
 93 |     pub const fn len(&self) -> usize {
 94 |         self.batch.n_tokens as usize
 95 |     }
 96 | 
 97 |     /// Returns true if batch is empty.
 98 |     pub const fn is_empty(&self) -> bool {
 99 |         self.len() == 0
100 |     }
101 | 
102 |     /// The size of each embedding.
103 |     pub const fn embd_len(&self) -> usize {
104 |         self.embd_len
105 |     }
106 | 
107 |     /// The maximum number of sequence ids per token.
108 |     pub const fn n_seq_max(&self) -> usize {
109 |         self.n_seq_max
110 |     }
111 | 
112 |     /// The tokens in this batch.
113 |     ///
114 |     /// This will return `None` if the [`Batch`] was created with `embd_len` set
115 |     /// to zero.
116 |     pub fn tokens(&self) -> Option<&[llama_token]> {
117 |         if self.batch.token.is_null() {
118 |             debug_assert!(!self.batch.embd.is_null());
119 |             None
120 |         } else {
121 |             Some(
122 |                 &unsafe {
123 |                     std::slice::from_raw_parts(
124 |                         self.batch.token,
125 |                         self.capacity(),
126 |                     )
127 |                 }[..self.len() as usize],
128 |             )
129 |         }
130 |     }
131 | 
132 |     /// The tokens in this batch.
133 |     ///
134 |     /// This will return `None` if the [`Batch`] was created with `embd_len` set
135 |     /// to zero.
136 |     pub fn tokens_mut(&mut self) -> Option<&mut [llama_token]> {
137 |         if self.batch.token.is_null() {
138 |             debug_assert!(!self.batch.embd.is_null());
139 |             None
140 |         } else {
141 |             Some(
142 |                 &mut unsafe {
143 |                     std::slice::from_raw_parts_mut(
144 |                         self.batch.token,
145 |                         self.capacity(),
146 |                     )
147 |                 }[..self.len() as usize],
148 |             )
149 |         }
150 |     }
151 | 
152 |     /// The embeddings in this batch at index `i`.
153 |     ///
154 |     /// This will return None if the index is invalid or if the batch was
155 |     /// created with `embd_len` set to zero.
156 |     pub fn embd(&self, i: usize) -> Option<&[f32]> {
157 |         if self.batch.embd.is_null() {
158 |             debug_assert!(!self.batch.token.is_null());
159 |             None
160 |         } else {
161 |             if (i as usize) >= self.len() {
162 |                 None
163 |             } else {
164 |                 Some(unsafe {
165 |                     std::slice::from_raw_parts(
166 |                         self.batch.embd.add(i * self.embd_len()),
167 |                         self.embd_len(),
168 |                     )
169 |                 })
170 |             }
171 |         }
172 |     }
173 | 
174 |     /// The embeddings in this batch at index `i`.
175 |     ///
176 |     /// This will return None if the index is invalid or if the batch was
177 |     /// created with `embd_len` set to zero.
178 |     pub fn embd_mut(&mut self, i: usize) -> Option<&mut [f32]> {
179 |         if self.batch.embd.is_null() {
180 |             debug_assert!(!self.batch.token.is_null());
181 |             None
182 |         } else {
183 |             if (i as usize) >= self.len() {
184 |                 None
185 |             } else {
186 |                 Some(unsafe {
187 |                     std::slice::from_raw_parts_mut(
188 |                         self.batch.embd.add(i * self.embd_len()),
189 |                         self.embd_len(),
190 |                     )
191 |                 })
192 |             }
193 |         }
194 |     }
195 | 
196 |     /// The position of a given index in the batch.
197 |     pub const fn pos(&self) -> &[i32] {
198 |         unsafe { std::slice::from_raw_parts(self.batch.pos, self.len()) }
199 |     }
200 | 
201 |     /// The position of a given index in the batch.
202 |     pub fn pos_mut(&mut self) -> &mut [i32] {
203 |         unsafe { std::slice::from_raw_parts_mut(self.batch.pos, self.len()) }
204 |     }
205 | 
206 |     /// The number of sequence ids for a given index in the batch.
207 |     pub const fn n_seq(&self) -> &[i32] {
208 |         unsafe { std::slice::from_raw_parts(self.batch.n_seq_id, self.len()) }
209 |     }
210 | 
211 |     /// The number of sequence ids for a given index in the batch.
212 |     fn n_seq_mut(&mut self) -> &mut [i32] {
213 |         unsafe {
214 |             std::slice::from_raw_parts_mut(self.batch.n_seq_id, self.len())
215 |         }
216 |     }
217 | 
218 |     /// Whether logits should be calculated at a given index in the batch.
219 |     pub fn logits(&self) -> &[bool] {
220 |         // Safety: This and the accessor below are safe because we know a bool
221 |         // is the same size as an i8 and we know the 0 and 1 values correspond
222 |         // to false and true. Otherwise the following would not compile:
223 |         static_assertions::assert_eq_size!(bool, i8);
224 |         static_assertions::const_assert_eq!(false as i8, 0);
225 |         static_assertions::const_assert_eq!(true as i8, 1);
226 | 
227 |         unsafe {
228 |             std::slice::from_raw_parts(
229 |                 self.batch.logits as *const bool,
230 |                 self.len(),
231 |             )
232 |         }
233 |     }
234 | 
235 |     /// Whether logits should be calculated at a given index in the batch.
236 |     fn logits_mut(&mut self) -> &mut [bool] {
237 |         unsafe {
238 |             std::slice::from_raw_parts_mut(
239 |                 self.batch.logits as *mut bool,
240 |                 self.len(),
241 |             )
242 |         }
243 |     }
244 | 
245 |     /// Clear the batch.
246 |     pub fn clear(&mut self) {
247 |         self.batch.n_tokens = 0;
248 |     }
249 | 
250 |     /// Add a token to the batch.
251 |     pub fn add_token(
252 |         &mut self,
253 |         token: llama_token,
254 |         pos: usize,
255 |         seq_ids: Option<&[llama_seq_id]>,
256 |         logits: bool,
257 |     ) -> Result<(), AddError> {
258 |         let i = self.len();
259 | 
260 |         if pos >= self.capacity() {
261 |             return Err(AddError::InvalidPosition);
262 |         }
263 | 
264 |         if i >= self.capacity() {
265 |             return Err(AddError::Full);
266 |         }
267 | 
268 |         if self.embd_len() != 0 {
269 |             return Err(AddError::ExpectedEmbedding);
270 |         }
271 | 
272 |         self.batch.n_tokens += 1;
273 | 
274 |         self.tokens_mut().unwrap()[i] = token;
275 |         self.pos_mut()[i] = pos as i32;
276 | 
277 |         let sequences = unsafe {
278 |             std::slice::from_raw_parts_mut(self.batch.seq_id, self.len())
279 |         };
280 |         let sequence = unsafe {
281 |             std::slice::from_raw_parts_mut(sequences[i], self.n_seq_max())
282 |         };
283 | 
284 |         match seq_ids {
285 |             Some(seq_ids) => {
286 |                 if seq_ids.len() > self.n_seq_max() {
287 |                     self.batch.n_tokens -= 1;
288 |                     return Err(AddError::InvalidSequenceLength);
289 |                 }
290 | 
291 |                 // We want to panic if the number of sequence ids is greater
292 |                 // than i32::MAX
293 |                 self.n_seq_mut()[i] = seq_ids.len().try_into().unwrap();
294 | 
295 |                 // Safety: This is safe because we control construction of the
296 |                 // batch and we know that the sequence ids are valid for the
297 |                 // lifetime of the batch. We also know that len is valid because
298 |                 // the only way it changes is through our accessor methods.
299 |                 sequence[..seq_ids.len()].copy_from_slice(seq_ids);
300 |                 sequence[seq_ids.len()..].fill(0);
301 |             }
302 |             None => {
303 |                 // There is always at least one sequence id
304 |                 self.n_seq_mut()[i] = 1;
305 |                 sequence[0] = 0;
306 |             }
307 |         }
308 |         self.logits_mut()[i] = logits;
309 | 
310 |         Ok(())
311 |     }
312 | 
313 |     /// Add tokens to the batch at consecutive positions starting at `pos`.
314 |     /// Stops at the first error; tokens added before it stay in the batch.
315 |     pub fn add_tokens<I>(
316 |         &mut self,
317 |         tokens: I,
318 |         pos: usize,
319 |         seq_ids: Option<&[llama_seq_id]>,
320 |         logits: bool,
321 |     ) -> Result<(), AddError>
322 |     where
323 |         I: IntoIterator<Item = llama_token>,
324 |     {
325 |         for (offset, token) in tokens.into_iter().enumerate() {
326 |             self.add_token(token, pos + offset, seq_ids, logits)?;
327 |         }
328 |         Ok(())
329 |     }
330 | }
331 | 
332 | impl Drop for Batch {
333 |     fn drop(&mut self) {
334 |         unsafe { llama_batch_free(self.batch) }; // free the wrapped batch
335 |     }
336 | }
337 | 
338 | #[cfg(test)]
339 | mod tests {
340 | 
341 |     use super::*;
342 | 
343 |     /// Fills a 16-slot batch twice for every `n_seq_max` in `1..16`: first
344 |     /// with explicit sequence ids, then with the default sequence. Afterwards
345 |     /// checks the `Full` and `InvalidPosition` errors.
346 |     #[test]
347 |     fn test_batch() {
348 |         for n_seq_max in 1..16usize {
349 |             let mut batch = Batch::new(16, 0, n_seq_max).unwrap();
350 |             // One id list is reused for every token of the first pass.
351 |             let seq_ids = vec![42; n_seq_max];
352 | 
353 |             // First pass: each token is tagged with `n_seq_max` sequence ids.
354 |             for idx in 0..16 {
355 |                 assert_eq!(batch.capacity(), 16);
356 |                 assert_eq!(batch.len(), idx);
357 |                 assert_eq!(batch.embd_len(), 0);
358 |                 assert_eq!(batch.n_seq_max(), n_seq_max);
359 |                 assert!(batch.tokens().is_some());
360 |                 assert!(batch.tokens_mut().is_some());
361 |                 assert!(batch.embd(idx).is_none());
362 |                 assert!(batch.embd_mut(idx).is_none());
363 | 
364 |                 let token = idx as llama_token;
365 |                 let added = batch.add_token(token, idx, Some(&seq_ids), true);
366 |                 assert_eq!(added, Ok(()));
367 |                 assert_eq!(batch.n_seq()[idx], n_seq_max as i32);
368 |                 assert_eq!(batch.logits()[idx], true);
369 |                 assert_eq!(batch.pos()[idx], idx as i32);
370 |             }
371 | 
372 |             // Start over with an empty batch.
373 |             batch.clear();
374 | 
375 |             // Second pass: no ids are given, so each token gets one sequence.
376 |             for idx in 0..16_usize {
377 |                 assert_eq!(batch.capacity(), 16);
378 |                 assert_eq!(batch.len(), idx);
379 |                 assert_eq!(batch.embd_len(), 0);
380 |                 assert_eq!(batch.n_seq_max(), n_seq_max);
381 |                 assert!(batch.tokens().is_some());
382 |                 assert!(batch.tokens_mut().is_some());
383 |                 assert!(batch.embd(idx).is_none());
384 |                 assert!(batch.embd_mut(idx).is_none());
385 | 
386 |                 let token = idx as llama_token;
387 |                 let added = batch.add_token(token, idx, None, false);
388 |                 assert_eq!(added, Ok(()));
389 |                 assert_eq!(batch.n_seq()[idx], 1);
390 |                 assert_eq!(batch.logits()[idx], false);
391 |                 assert_eq!(batch.pos()[idx], idx as i32);
392 |             }
393 | 
394 |             // All 16 slots are taken, so a valid position reports `Full`.
395 |             let when_full = batch.add_token(16, 15, None, true);
396 |             assert_eq!(when_full, Err(AddError::Full));
397 | 
398 |             // Position 16 is rejected as invalid.
399 |             let bad_pos = batch.add_token(16, 16, None, true);
400 |             assert_eq!(bad_pos, Err(AddError::InvalidPosition));
401 |         }
402 |     }
403 | }
404 | 


--------------------------------------------------------------------------------
/src/cli.rs:
--------------------------------------------------------------------------------
 1 | use std::path::PathBuf;
 2 | 
 3 | use clap::Parser;
 4 | 
 5 | use llama_cpp_sys_3::{
 6 |     llama_context_default_params, llama_context_params,
 7 |     llama_model_default_params, llama_model_params,
 8 | };
 9 | 
10 | use crate::VocabKind;
11 | 
12 | #[derive(Debug, Parser)]
13 | pub struct Args { // NOTE: the `///` comments on the fields are clap's help text
14 |     /// Path to the model
15 |     #[arg(short, long)]
16 |     pub model: PathBuf, // no default value is declared for this one
17 |     /// Context size
18 |     #[arg(short, long, default_value_t = 1024)]
19 |     pub context: u32, // copied into `llama_context_params::n_ctx`
20 |     /// Disable on-by-default GPU acceleration
21 |     #[arg(short, long, default_value_t = false)]
22 |     pub no_gpu: bool, // when set, `n_gpu_layers` becomes 0 instead of 1000
23 |     /// Vocabulary
24 |     #[arg(short, long, default_value_t = VocabKind::Safe)]
25 |     pub vocab: VocabKind,
26 | }
27 | 
28 | impl Args {
29 |     /// Build `llama_model_params` from these arguments. Fields that `Args`
30 |     /// does not specify keep their default values.
31 |     pub fn model_params(&self) -> llama_model_params {
32 |         llama_model_params::from(self)
33 |     }
34 | 
35 |     /// Build `llama_context_params` from these arguments. Fields that `Args`
36 |     /// does not specify keep their default values.
37 |     pub fn context_params(&self) -> llama_context_params {
38 |         llama_context_params::from(self)
39 |     }
40 | }
41 | 
42 | impl From<&Args> for llama_model_params {
43 |     fn from(args: &Args) -> Self {
44 |         // `no_gpu` requests zero GPU layers; otherwise ask for 1000 of them.
45 |         let gpu_layers = if args.no_gpu { 0 } else { 1000 };
46 |         // Safety: returns plain data; its optional pointer fields are null.
47 |         let mut model_params = unsafe { llama_model_default_params() };
48 |         model_params.n_gpu_layers = gpu_layers;
49 |         model_params
50 |     }
51 | }
52 | 
53 | impl From<&Args> for llama_context_params {
54 |     fn from(args: &Args) -> Self {
55 |         // Safety: the call returns a plain-data struct of default values.
56 |         let mut ctx_params = unsafe { llama_context_default_params() };
57 |         // Only the context size comes from the command line.
58 |         ctx_params.n_ctx = args.context;
59 |         ctx_params
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/data.rs:
--------------------------------------------------------------------------------
1 | pub(crate) mod stopwords;
2 | pub use stopwords::StopWords;
3 | 
4 | pub(crate) mod banned;
5 | 


--------------------------------------------------------------------------------
/src/data/stopwords.rs:
--------------------------------------------------------------------------------
  1 | use llama_cpp_sys_3::llama_token;
  2 | 
  3 | use crate::Model;
  4 | 
  5 | /// Lists of very common words ("stop words") for various languages. They can
  6 | /// be used to ignore certain tokens, e.g. when detecting repetition.
  7 | #[cfg_attr(
  8 |     feature = "serde",
  9 |     derive(rocket::serde::Deserialize, rocket::serde::Serialize)
 10 | )]
 11 | #[cfg_attr(feature = "serde", serde(crate = "rocket::serde"))]
 12 | #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 13 | pub enum StopWords {
 14 |     // NOTE: When adding a language, also add it to `ALL`, and keep both this
 15 |     // list and `ALL` in alphabetical order.
 16 |     // TODO: enforce the above with a static assertion.
 17 |     English,
 18 | }
 19 | 
 20 | impl StopWords {
 21 |     /// Every supported language, in alphabetical order.
 22 |     pub const ALL: [StopWords; 1] = [StopWords::English];
 23 | 
 24 |     /// Name of the language.
 25 |     pub const fn as_str(&self) -> &'static str {
 26 |         match self {
 27 |             StopWords::English => "English",
 28 |         }
 29 |     }
 30 | 
 31 |     /// The stop words for the language.
 32 |     pub const fn words(&self) -> &'static [&'static str] {
 33 |         match self {
 34 |             StopWords::English => ENGLISH,
 35 |         }
 36 |     }
 37 | 
 38 |     /// Tokenizes each stop word with `model` and yields the resulting tokens
 39 |     /// as one flat sequence, in word order.
 40 |     pub fn into_tokens(
 41 |         self,
 42 |         model: &Model,
 43 |     ) -> impl Iterator<Item = llama_token> + '_ {
 44 |         // TODO: `tokenize` allocates for every word. Returning an iterator
 45 |         // from it would avoid that, but this is only done once.
 46 |         self.words().iter().flat_map(|word| model.tokenize(word, false))
 47 |     }
 48 | }
 49 | 
 50 | /// A list of common English stopwords from NLTK.
 51 | pub const ENGLISH: &[&str] = &[
 52 |     "a",
 53 |     "about",
 54 |     "above",
 55 |     "after",
 56 |     "again",
 57 |     "against",
 58 |     "all",
 59 |     "am",
 60 |     "an",
 61 |     "and",
 62 |     "any",
 63 |     "are",
 64 |     "as",
 65 |     "at",
 66 |     "be",
 67 |     "because",
 68 |     "been",
 69 |     "before",
 70 |     "being",
 71 |     "below",
 72 |     "between",
 73 |     "both",
 74 |     "but",
 75 |     "by",
 76 |     "can",
 77 |     "did",
 78 |     "do",
 79 |     "does",
 80 |     "doing",
 81 |     "don",
 82 |     "down",
 83 |     "during",
 84 |     "each",
 85 |     "few",
 86 |     "for",
 87 |     "from",
 88 |     "further",
 89 |     "had",
 90 |     "has",
 91 |     "have",
 92 |     "having",
 93 |     "he",
 94 |     "her",
 95 |     "here",
 96 |     "hers",
 97 |     "herself",
 98 |     "him",
 99 |     "himself",
100 |     "his",
101 |     "how",
102 |     "i",
103 |     "if",
104 |     "in",
105 |     "into",
106 |     "is",
107 |     "it",
108 |     "its",
109 |     "itself",
110 |     "just",
111 |     "me",
112 |     "more",
113 |     "most",
114 |     "my",
115 |     "myself",
116 |     "no",
117 |     "nor",
118 |     "not",
119 |     "now",
120 |     "of",
121 |     "off",
122 |     "on",
123 |     "once",
124 |     "only",
125 |     "or",
126 |     "other",
127 |     "our",
128 |     "ours",
129 |     "ourselves",
130 |     "out",
131 |     "over",
132 |     "own",
133 |     "s",
134 |     "same",
135 |     "she",
136 |     "should",
137 |     "so",
138 |     "some",
139 |     "such",
140 |     "t",
141 |     "than",
142 |     "that",
143 |     "the",
144 |     "their",
145 |     "theirs",
146 |     "them",
147 |     "themselves",
148 |     "then",
149 |     "there",
150 |     "these",
151 |     "they",
152 |     "this",
153 |     "those",
154 |     "through",
155 |     "to",
156 |     "too",
157 |     "under",
158 |     "until",
159 |     "up",
160 |     "very",
161 |     "was",
162 |     "we",
163 |     "were",
164 |     "what",
165 |     "when",
166 |     "where",
167 |     "which",
168 |     "while",
169 |     "who",
170 |     "whom",
171 |     "why",
172 |     "will",
173 |     "with",
174 |     "you",
175 |     "your",
176 |     "yours",
177 |     "yourself",
178 |     "yourselves",
179 | ];
180 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | // TODO: Importing everything from the submodules is fine for small crates, but
 2 | // this is getting crowded. When we go to version 0.2.0, we should consider
 3 | // making modules public.
 4 | 
 5 | #[cfg(feature = "cli")]
 6 | pub mod cli;
 7 | 
 8 | #[cfg_attr(test, macro_use)]
 9 | pub(crate) mod utils;
10 | 
11 | pub mod data;
12 | 
13 | mod sample;
14 | pub use sample::{
15 |     RepetitionError, RepetitionOptions, SampleOptions, SamplingMode,
16 | };
17 | 
18 | mod batch;
19 | pub(crate) use batch::Batch;
20 | 
21 | mod candidates;
22 | pub use candidates::{Candidates, Sorted, TokenDataArray};
23 | 
24 | pub mod prompt;
25 | pub use prompt::{Message, Prompt, Role};
26 | 
27 | mod model;
28 | pub use model::{llama_quantize, Model, Vocab, VocabKind};
29 | 
30 | mod ngram;
31 | pub use ngram::{NGram, NGramData, NGramStats};
32 | 
33 | mod engine;
34 | pub use engine::{Engine, NewError};
35 | 
36 | mod predictor;
37 | pub use predictor::{
38 |     CandidatePredictor, PiecePredictor, PredictOptions, Predicted, Predictor,
39 |     TokenPredictor,
40 | };
41 | 
42 | mod probability;
43 | pub use probability::{InvalidProbability, Probability};
44 | 
45 | pub const TOS: &str = include_str!("../TERMS_OF_USE.md");
46 | 


--------------------------------------------------------------------------------
/src/model/vocab.rs:
--------------------------------------------------------------------------------
  1 | //! Vocabulary constraints.
  2 | 
  3 | use llama_cpp_sys_3::llama_token;
  4 | use regex::Regex;
  5 | 
  6 | use crate::{data::banned::Banned, model::token_to_piece_ref, Model, NGram};
  7 | 
  8 | /// A very imperfect regex for safe tokens. This could use some improvement. NOTE(review): `|` binds loosest, so `^` anchors only the first alternative and `$` only the last -- confirm that is intended.
  9 | pub const SAFE_REGEX: &str = r#"^[▁ a-zA-Z]{2,32}|[ ▁\(\)\.\?!\"\'\-_]{1,32}|[aAI]{1}|\n{1,3}|\t{1,3}| {1,16}$"#;
 10 | pub const LETTERS_REGEX: &str = r#"^[a-zA-Z]{1}$"#;
 11 | pub const CODE_REGEX: &str = r#"^[ \d\\(\){\}\[\]\;\:\"\'\<\>\,\.\\\/\?\.\!\@\#\$\%\^\&\=\`\~]{1,32}|\w{2,32}$"#; // NOTE(review): same anchoring caveat as `SAFE_REGEX`
 12 | 
 13 | // Temporary, until the regex works for llama: it works in regex101, but not
 14 | // here, and with these tokens banned weird things happen.
 15 | const LLAMA_2_ALLOW_LIST: &[llama_token] = &[
 16 |     0,     // unknown
 17 |     1,     // bos
 18 |     2,     // eos
 19 |     0x0D,  // \n -- NOTE(review): this is 0x0A + 3, yet the ids below are plain ASCII codes; confirm
 20 |     0x20,  // space
 21 |     0x49,  // I
 22 |     0x3D,  // =
 23 |     0x61,  // a
 24 |     0x75,  // u
 25 |     29871, // ▁ (word boundary)
 26 |     29874, // a
 27 |     29889, // .
 28 |     29892, // ,
 29 |     29897, // )
 30 |     29898, // (
 31 |     29899, // -
 32 |     29901, // :
 33 |     29902, // I
 34 |     29909, // A
 35 |     29912, // {
 36 |     29913, // }
 37 |     29915, // '
 38 |     29918, // _
 39 |     29922, // =
 40 |     29930, // *
 41 |     29936, // ;
 42 |     29937, // #
 43 |     29938, // $
 44 |     29944, // л
 45 |     29961, // [
 46 |     29962, // ]
 47 |     29973, // ?
 48 |     29974, // +
 49 |     29985, // ^
 50 |     29989, // |
 51 |     29991, // !
 52 |     29992, // @
 53 |     29995, // %
 54 |     30022, // ~
 55 |     30098, // …
 56 |     30142, // λ
 57 | ];
 58 | 
 59 | const LLAMA_2_ALLOW_RANGES: &[std::ops::RangeInclusive<llama_token>] = &[
 60 |     0x20..=0x3C, // read as ASCII: space, then !"#$%&'()*+,-./0123456789:;<
 61 |     0x3F..=0x41, // ?@A
 62 |     0x5B..=0x60, // [\]^_`
 63 |     0x7B..=0x7E, // {|}~
 64 | ];
 65 | 
 66 | #[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
 67 | #[derive(Debug, Clone, Copy, PartialEq)]
 68 | pub enum VocabKind {
 69 |     /// All tokens and control characters are allowed. This is not recommended,
 70 |     /// especially if the output is going to be used in a web context. Banned
 71 |     /// n-grams are still enforced.
 72 |     Unsafe,
 73 |     /// Words, word fragments, punctuation, and the letter "a" are allowed. This
 74 |     /// is the default vocabulary. The idea is to prohibit generation of
 75 |     /// arbitrary sequences which could bypass filters, as well as code which
 76 |     /// could cause security issues.
 77 |     ///
 78 |     /// That being said *this is not yet validated* to be very safe, so care
 79 |     /// should be taken especially for web contexts.
 80 |     Safe,
 81 |     /// Letters only. Allowing this will allow generation of any sequence, but
 82 |     /// only one letter at a time. This is unsafe and should not be used unless
 83 |     /// it's absolutely necessary.
 84 |     ///
 85 |     /// Using it to generate bigotry is a violation of the license under which
 86 |     /// this software is distributed. See `LICENSE.md` for details.
 87 |     Letters,
 88 |     /// Code. This will allow generation of words, digits, and common symbols
 89 |     /// used in code. Letters are not enabled.
 90 |     // Single letters stay out of this vocabulary: 4chan got GPT-4 to generate
 91 |     // the N word by getting it to "run" code concatenating individual letters.
 92 |     Code,
 93 | }
 94 | 
 95 | // Deriving `Display` with derive_more fails here, hence the manual impl.
 96 | #[cfg(feature = "cli")]
 97 | impl std::fmt::Display for VocabKind {
 98 |     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
 99 |         f.write_str(match self {
100 |             VocabKind::Unsafe => "unsafe",
101 |             VocabKind::Safe => "safe",
102 |             VocabKind::Letters => "letters",
103 |             VocabKind::Code => "code",
104 |         })
105 |     }
106 | }
107 | 
108 | impl From<VocabKind> for Regex { // `From` also supplies the old `Into`
109 |     fn from(kind: VocabKind) -> Regex {
110 |         Regex::new(match kind {
111 |             VocabKind::Unsafe => "(?s).*", // was "*", which fails to parse
112 |             VocabKind::Safe => SAFE_REGEX,
113 |             VocabKind::Letters => LETTERS_REGEX,
114 |             VocabKind::Code => CODE_REGEX,
115 |         }).unwrap() // panics if a pattern is invalid
116 |     }
117 | }
118 | 
119 | #[derive(Debug)]
120 | pub struct Vocab {
121 |     /// Allowed tokens, indexed by token id. A `Vec<bool>` rather than a list
122 |     /// of ranges so that lookup is O(1); lookups happen in a fairly tight
123 |     /// loop, so it's probably worth it.
124 |     allowed_tokens: Vec<bool>,
125 |     /// Banned ngrams: at least every pair of tokens that would generate a
126 |     /// banned word (letters excluded, the permutations are too many). `None`
127 |     /// unless the model's description starts with "llama v2".
128 |     banned: Option<Banned>,
129 |     /// Piece length of the longest token; used to optimize stop-string search.
130 |     longest_token: usize,
131 | }
132 | 
133 | impl Vocab {
134 |     /// Build the vocabulary for `model`. A token is allowed when its piece
135 |     /// matches the regex of any `enabled` kind; [`VocabKind::Unsafe`] allows
136 |     /// every token. Models whose description starts with "llama v2" also get
137 |     /// the Llama 2 allow lists and the `Banned::LlamaEnglish` ngrams.
138 |     pub fn new(
139 |         enabled: impl IntoIterator<Item = VocabKind>,
140 |         model: &Model,
141 |     ) -> Self {
142 |         let enabled: Vec<VocabKind> = enabled.into_iter().collect();
143 |         let is_llama_2 = model.desc().to_lowercase().starts_with("llama v2");
144 |         let banned = if is_llama_2 { Some(Banned::LlamaEnglish) } else { None };
145 |         if enabled.contains(&VocabKind::Unsafe) {
146 |             return Self {
147 |                 allowed_tokens: vec![true; model.n_vocab() as usize],
148 |                 longest_token: model.max_token_len(),
149 |                 banned,
150 |             };
151 |         }
152 |         let enabled: Vec<Regex> = enabled.into_iter().map(Into::into).collect();
153 | 
154 |         let mut buf = Vec::new();
155 |         let mut allowed_tokens: Vec<bool> = (0..model.n_vocab())
156 |             .map(|token| {
157 |                 // Start from an empty buffer, as the tests in this file do,
158 |                 // so a piece can never carry bytes over from the last token.
159 |                 buf.clear();
160 |                 token_to_piece_ref(token, model, &mut buf);
161 |                 // Decode once per token instead of once per regex.
162 |                 let piece = String::from_utf8_lossy(&buf);
163 |                 enabled.iter().any(|re| re.is_match(&piece))
164 |             })
165 |             .collect();
166 | 
167 |         if is_llama_2 {
168 |             for &token in LLAMA_2_ALLOW_LIST {
169 |                 allowed_tokens[token as usize] = true;
170 |             }
171 |             for range in LLAMA_2_ALLOW_RANGES {
172 |                 for token in range.clone() {
173 |                     allowed_tokens[token as usize] = true;
174 |                 }
175 |             }
176 |         }
177 | 
178 |         // TODO: Fix regex, or add LLAMA_3 allow list. As it is now, generation
179 |         // is potato without "Unsafe" vocab because the regex is too strict.
180 | 
181 |         Self {
182 |             allowed_tokens,
183 |             longest_token: model.max_token_len(),
184 |             banned,
185 |         }
186 |     }
187 | 
188 |     /// Returns true if an ngram is forbidden. Forbidden [`NGram`]s are those
189 |     /// that contain a token that is not allowed, or whose first two tokens
190 |     /// are found in the banned ngrams set.
191 |     pub fn is_forbidden(&self, ngram: &NGram) -> bool {
192 |         if ngram
193 |             .iter()
194 |             .any(|&token| !self.allowed_tokens[token as usize])
195 |         {
196 |             return true;
197 |         }
198 |         if let Some(banned) = &self.banned {
199 |             banned
200 |                 .as_slice()
201 |                 .binary_search(&[ngram[0], ngram[1]])
202 |                 .is_ok()
203 |         } else {
204 |             false
205 |         }
206 |     }
207 | 
208 |     /// Piece length of the longest token.
209 |     ///
210 |     /// Time complexity: O(1).
211 |     pub fn max_token_len(&self) -> usize {
212 |         self.longest_token
213 |     }
214 | 
215 |     /// Allowed tokens, indexed by token id (`true` means allowed).
216 |     pub fn allowed_tokens(&self) -> &Vec<bool> {
217 |         &self.allowed_tokens
218 |     }
219 | 
220 |     /// Banned ngrams, if any are enforced for the model.
221 |     pub fn banned(&self) -> Option<&Banned> {
222 |         self.banned.as_ref()
223 |     }
224 | 
225 |     /// Returns the number of allowed tokens.
226 |     ///
227 |     /// O(n) where n is the number of tokens.
228 |     pub fn n_allowed(&self) -> usize {
229 |         self.allowed_tokens.iter().filter(|&&b| b).count()
230 |     }
231 | }
232 | 
233 | #[cfg(test)]
234 | mod tests {
235 |     use super::*;
236 |     use llama_cpp_sys_3::llama_token;
237 |     use rayon::prelude::*;
238 |     use std::{
239 |         collections::{BTreeSet, HashSet},
240 |         path::PathBuf,
241 |     };
242 | 
243 |     /// Generate banned ngrams for a model: every pair of tokens whose
244 |     /// lowercased, concatenated pieces match one of the `ENGLISH_WORDS`
245 |     /// patterns. This is very slow and can take a few minutes even on a fast
246 |     /// machine. It is only used for testing and generating the banned ngrams
247 |     /// for the various models.
248 |     fn generate_banned_ngrams(model: &Model) -> BTreeSet<NGram> {
249 |         // Safety: this is only called from test code and we don't use any
250 |         // methods that mutate the model, so it is safe to share between
251 |         // threads. In the future we might make model actually thread safe.
252 |         unsafe impl Sync for Model {}
253 | 
254 |         let mut banned_ngrams = BTreeSet::new();
255 | 
256 |         let n_vocab = model.n_vocab();
257 |         let banned_regex: Vec<Regex> = crate::data::banned::ENGLISH_WORDS
258 |             .iter()
259 |             .map(|s| Regex::new(s).unwrap())
260 |             .collect();
261 | 
262 |         let (tx, rx) = std::sync::mpsc::channel();
263 |         (0..n_vocab).into_par_iter().for_each_with(tx, |tx, first| {
264 |             let mut first_buf = Vec::new();
265 |             let mut second_buf = Vec::new();
266 |             let mut joined_buf = String::new();
267 |             let mut banned_chunk: HashSet<NGram> = HashSet::new();
268 | 
269 |             // The first piece is the same for every pair in this chunk, so
270 |             // decode and lowercase it once rather than once per `second`.
271 |             token_to_piece_ref(first, model, &mut first_buf);
272 |             let head = String::from_utf8_lossy(&first_buf).to_lowercase();
273 | 
274 |             for second in 0..n_vocab {
275 |                 second_buf.clear();
276 |                 joined_buf.clear();
277 | 
278 |                 token_to_piece_ref(second, model, &mut second_buf);
279 |                 joined_buf.push_str(&head);
280 |                 joined_buf.push_str(
281 |                     &String::from_utf8_lossy(&second_buf).to_lowercase(),
282 |                 );
283 | 
284 |                 // Both halves are already lowercase, so `joined_buf` is
285 |                 // matched as is. One matching pattern bans the pair.
286 |                 if banned_regex.iter().any(|re| re.is_match(&joined_buf)) {
287 |                     let ngram =
288 |                         NGram::try_from_tokens(&[first, second]).unwrap();
289 |                     banned_chunk.insert(ngram);
290 |                 }
291 |             }
292 | 
293 |             tx.send(banned_chunk).unwrap();
294 |         });
295 | 
296 |         banned_ngrams.extend(rx.into_iter().flatten());
297 | 
298 |         banned_ngrams
299 |     }
300 | 
301 |     /// Every hard-coded banned bigram must be forbidden by the `Safe` vocab.
302 |     /// Needs a model file at `models/model.gguf`.
303 |     #[test]
304 |     fn test_vocab() {
305 |         // This is a llama model
306 |         let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
307 |         path.push("models/model.gguf");
308 | 
309 |         let model = Model::from_file(path, None).unwrap();
310 |         let vocab = Vocab::new(vec![VocabKind::Safe], &model);
311 | 
312 |         // Check that the ngrams are forbidden
313 |         for forbidden in crate::data::banned::ENGLISH_BIGRAMS {
314 |             let ngram = NGram::try_from_tokens(forbidden).unwrap();
315 |             assert!(vocab.is_forbidden(&ngram));
316 |         }
317 |     }
318 | 
319 |     #[test]
320 |     #[ignore = "very long running"]
321 |     /// This is a very long running test that generates the banned n-grams for
322 |     /// the Llama model. This can take a few minutes even on a fast machine.
323 |     fn test_banned_ngrams_llama() {
324 |         // This is a llama model
325 |         let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
326 |         let model_path = root.join("models/model.gguf");
327 |         let model = Model::from_file(model_path, None).unwrap();
328 |         let out_path =
329 |             root.join("tests/data/banned_ngrams/ngrams-english-llama.txt");
330 | 
331 |         // `expected` is regenerated from the model; `actual` is the list
332 |         // that is compiled into the crate.
333 |         let expected = generate_banned_ngrams(&model);
334 |         let actual: BTreeSet<NGram> = crate::data::banned::ENGLISH_BIGRAMS
335 |             .iter()
336 |             .filter_map(|slice| NGram::try_from_tokens(slice).ok())
337 |             .collect();
338 | 
339 |         let v: Vec<Vec<llama_token>> =
340 |             expected.iter().map(|n| n.as_slice().to_vec()).collect();
341 | 
342 |         // This representation should be easy to copy and paste into the
343 |         // BANNED_LLAMA_NGRAMS array. We could automate this, but I don't want
344 |         // to automate generation of code.
345 |         std::fs::write(out_path, format!("{:#?}", v)).unwrap();
346 | 
347 |         assert_eq!(expected, actual);
348 |     }
349 | }
350 | 


--------------------------------------------------------------------------------
/src/probability.rs:
--------------------------------------------------------------------------------
  1 | #[cfg(feature = "serde")]
  2 | use rocket::serde::Deserialize;
  3 | use static_assertions::assert_impl_all;
  4 | 
  5 | /// Error returned when a value outside `0.0..=1.0` is used as a probability.
  6 | #[derive(Debug, PartialEq, thiserror::Error)]
  7 | #[error("Invalid probability. Must be between 0.0 and 1.0. Got {p}")]
  8 | pub struct InvalidProbability<F>
  9 | where
 10 |     F: std::fmt::Display,
 11 | {
 12 |     p: F, // the rejected value, echoed in the error message
 13 | }
 14 | 
 15 | assert_impl_all!(InvalidProbability<f32>: Send, Sync);
 16 | assert_impl_all!(InvalidProbability<f64>: Send, Sync);
 17 | 
 18 | /// A wrapper around a floating point number that represents a probability.
 19 | /// [`Probability::from_f`] only accepts values between 0.0 and 1.0 inclusive.
 20 | #[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, Hash)]
 21 | #[repr(transparent)]
 22 | pub struct Probability<F> {
 23 |     pub(crate) p: F, // crate code can set this directly, skipping the check
 24 | }
 25 | impl<F> Probability<F> {
 26 |     /// Wrap `p`, or hand it back in the error if it is outside `[0, 1]`.
 27 |     pub fn from_f(p: F) -> Result<Self, InvalidProbability<F>>
 28 |     where
 29 |         F: num::Zero + num::One + std::cmp::PartialOrd + std::fmt::Display,
 30 |     {
 31 |         match p >= F::zero() && p <= F::one() {
 32 |             true => Ok(Probability { p }),
 33 |             false => Err(InvalidProbability { p }),
 34 |         }
 35 |     }
 36 | 
 37 |     pub fn into_f(self) -> F {
 38 |         self.p // the wrapped value, unchanged
 39 |     }
 40 | }
 41 | 
 42 | // Lets a probability be compared directly with a bare value, as in
 43 | // `p == 0.5`, without unwrapping it first.
 44 | impl<F: PartialEq<F>> PartialEq<F> for Probability<F> {
 45 |     fn eq(&self, rhs: &F) -> bool {
 46 |         // Delegates to the wrapped value's own equality.
 47 |         self.p == *rhs
 48 |     }
 49 | }
 50 | 
 51 | // Lets a probability be ordered against a bare value, as in `p > 0.0`,
 52 | // without unwrapping it first.
 53 | impl<F: PartialOrd<F>> PartialOrd<F> for Probability<F> {
 54 |     fn partial_cmp(&self, rhs: &F) -> Option<std::cmp::Ordering> {
 55 |         // Delegates to the wrapped value's own ordering.
 56 |         PartialOrd::partial_cmp(&self.p, rhs)
 57 |     }
 58 | }
 59 | 
 60 | #[cfg(feature = "serde")]
 61 | impl<'de, F> Deserialize<'de> for Probability<F>
 62 | where
 63 |     F: Deserialize<'de>
 64 |         + num::Zero
 65 |         + num::One
 66 |         + std::cmp::PartialOrd
 67 |         + std::fmt::Display,
 68 | {
 69 |     /// Reads a bare `F`, then applies the same range check as `from_f`.
 70 |     fn deserialize<D: rocket::serde::Deserializer<'de>>(
 71 |         deserializer: D,
 72 |     ) -> Result<Self, D::Error> {
 73 |         let raw = F::deserialize(deserializer)?;
 74 |         Probability::from_f(raw).map_err(rocket::serde::de::Error::custom)
 75 |     }
 76 | }
 77 | 
 78 | // Serializes as the bare inner value, mirroring `Deserialize`, which reads
 79 | // a bare `F` as well.
 80 | #[cfg(feature = "serde")]
 81 | impl<F: rocket::serde::Serialize> rocket::serde::Serialize for Probability<F> {
 82 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
 83 |     where
 84 |         S: rocket::serde::Serializer,
 85 |     {
 86 |         // No wrapper structure is emitted around the value.
 87 |         rocket::serde::Serialize::serialize(&self.p, serializer)
 88 |     }
 89 | }
 90 | 
 91 | // Rust complains about conflicting implementations when the conversions
 92 | // are written generically, so a macro generates them per float type. `From`
 93 | // is implemented instead of `Into`: it provides `Into` and `f32::from(p)`.
 94 | macro_rules! impl_from_to_float {
 95 |     ($($t:ty),*) => {
 96 |         $(
 97 |             impl TryFrom<$t> for Probability<$t> {
 98 |                 type Error = InvalidProbability<$t>;
 99 | 
100 |                 fn try_from(p: $t) -> Result<Self, Self::Error> {
101 |                     Probability::from_f(p)
102 |                 }
103 |             }
104 | 
105 |             impl From<Probability<$t>> for $t {
106 |                 fn from(p: Probability<$t>) -> $t {
107 |                     p.into_f()
108 |                 }
109 |             }
110 |         )*
111 |     };
112 | }
113 | 
114 | impl_from_to_float!(f32, f64);
115 | 
116 | #[cfg(test)]
117 | mod tests {
118 |     use super::*;
119 | 
120 |     #[test]
121 |     fn test_probability() {
122 |         // Out-of-range values are rejected and handed back in the error.
123 |         let too_big = Probability::try_from(1.1_f64).unwrap_err();
124 |         assert_eq!(too_big.p, 1.1);
125 |         let negative = Probability::try_from(-0.1_f32).unwrap_err();
126 |         assert_eq!(negative.p, -0.1);
127 | 
128 |         // An in-range value is accepted and compares equal to its input.
129 |         let half = Probability::try_from(0.5).unwrap();
130 |         assert_eq!(half, 0.5);
131 | 
132 |         // Ordering against a bare float works too.
133 |         assert!(half > 0.0 && half < 1.0);
134 | 
135 |         // Converting back yields the original float.
136 |         let raw: f32 = half.into();
137 |         assert_eq!(raw, 0.5);
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/prompt.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt::{Display, Formatter};
  2 | 
  3 | mod format;
  4 | pub use format::Format;
  5 | 
  6 | use crate::Model;
  7 | 
  8 | /// A chat prompt: an optional setting, the speakers' names, and a transcript.
  9 | /// This takes inspiration from the OpenAI API, but is not intended to be
 10 | /// compatible with it.
 11 | #[derive(Debug, Clone)]
 12 | #[cfg_attr(test, derive(PartialEq))]
 13 | #[cfg_attr(all(test, feature = "webchat"), derive(rocket::UriDisplayQuery))]
 14 | #[cfg_attr(
 15 |     feature = "serde",
 16 |     derive(rocket::serde::Deserialize, rocket::serde::Serialize)
 17 | )]
 18 | #[cfg_attr(feature = "webchat", derive(rocket::form::FromForm))]
 19 | #[cfg_attr(feature = "serde", serde(crate = "rocket::serde"))]
 20 | pub struct Prompt {
 21 |     /// Setting, as in set and setting. This is the context in which the
 22 |     /// interaction takes place. It may be a location, a time, a situation, or
 23 |     /// any other context that may be relevant. The composition of a universe.
 24 |     #[cfg_attr(feature = "webchat", field(validate = len(..4096), default = None))]
 25 |     pub setting: Option<String>,
 26 |     /// Agent's name, e.g. "Mr. Rogers" or "GPT-5".
 27 |     #[cfg_attr(feature = "webchat", field(validate = len(..64), default = "assistant"))]
 28 |     pub agent: String,
 29 |     /// Human's name, e.g. "Alice" or "Bob".
 30 |     #[cfg_attr(feature = "webchat", field(validate = len(..64), default = "user"))]
 31 |     pub human: String,
 32 |     /// System's name, e.g. "System", "Narrator", or "God". Should imply
 33 |     /// authority to the Agent -- not necessarily to the Human.
 34 |     #[cfg_attr(feature = "webchat", field(validate = len(..64), default = None))]
 35 |     pub system: Option<String>,
 36 |     /// Messages in the chat transcript. The `webchat` form validates `len(2..512)`.
 37 |     #[cfg_attr(feature = "webchat", field(validate = len(2..512)))]
 38 |     pub transcript: Vec<Message>,
 39 | }
 40 | 
 41 | impl Prompt {
 42 |     /// Load from a TOML file. A parse failure is returned as `InvalidData`.
 43 |     #[cfg(feature = "toml")]
 44 |     pub fn load(path: std::path::PathBuf) -> std::io::Result<Self> {
 45 |         use std::io::{Error, ErrorKind::InvalidData};
 46 |         let text = std::fs::read_to_string(path)?;
 47 |         toml::from_str(&text).map_err(|e| Error::new(InvalidData, e))
 48 |     }
 49 | 
 50 |     /// Write the prompt to `f` in the given `format`. This does not add a BOS
 51 |     /// token, so if one is desired it must be prepended, or
 52 |     /// [`Prompt::format_for_model`] must be used instead.
 53 |     pub fn format<F>(&self, format: Format, f: &mut F) -> std::fmt::Result
 54 |     where
 55 |         F: std::fmt::Write,
 56 |     {
 57 |         format.format_prompt(self, None, f)
 58 |     }
 59 | 
 60 |     /// Format the prompt for a specific model. This adds a BOS token if the
 61 |     /// model requires it. If this is unknown, a BOS token will **not** be
 62 |     /// added. This is the recommended method for formatting a prompt.
 63 |     ///
 64 |     /// This will first attempt to use native formatting for the model. If a
 65 |     /// format would be [`Unknown`], it will attempt to apply a chat template using
 66 |     /// the model's metadata and `llama.cpp`. If *that* fails, it will use the
 67 |     /// [`Unknown`] format as a last resort, formatting for foundation models.
 68 |     ///
 69 |     /// This does not add the assistant's prefix to the prompt. If this is
 70 |     /// desired, [`format_agent_prefix`] should be called after this method or
 71 |     /// [`Model::apply_chat_template`] should be used instead with the `add_ass`
 72 |     /// parameter set to `true`.
 73 |     ///
 74 |     /// [`format_agent_prefix`]: Self::format_agent_prefix
 75 |     /// [`Unknown`]: Format::Unknown
 76 |     pub fn format_for_model<F>(
 77 |         &self,
 78 |         model: &Model,
 79 |         f: &mut F,
 80 |     ) -> std::fmt::Result
 81 |     where
 82 |         F: std::fmt::Write,
 83 |     {
 84 |         let format = match Format::from_model(model) {
 85 |             Some(format) => format,
 86 |             None => match model.apply_chat_template(None, self, false) {
 87 |                 Some(string) => return f.write_str(&string),
 88 |                 None => Format::Unknown,
 89 |             },
 90 |         };
 91 |         format.format_prompt(self, Some(model), f)
 92 |     }
 93 | 
 94 |     /// Format the agent's prefix. This should be called after a format method
 95 |     /// in order to append the agent's prefix to the prompt which in turn forces
 96 |     /// the model to generate a response from the agent's perspective.
 97 |     pub fn format_agent_prefix<F>(
 98 |         &self,
 99 |         format: Format,
100 |         f: &mut F,
101 |     ) -> std::fmt::Result
102 |     where
103 |         F: std::fmt::Write,
104 |     {
105 |         format.format_agent_prefix(f, self)
106 |     }
107 | 
108 |     /// Get the agent's prefix. This a convenience method that creates a new
109 |     /// string and formats it with [`format_agent_prefix`].
110 |     ///
111 |     /// [`format_agent_prefix`]: Self::format_agent_prefix
112 |     pub fn agent_prefix(&self, format: Format) -> String {
113 |         let mut s = String::new();
114 |         self.format_agent_prefix(format, &mut s).unwrap();
115 |         s
116 |     }
117 | 
118 |     /// Format the human's prefix. This can be used to format stop criteria so
119 |     /// that the model knows when to stop generating text.
120 |     pub fn format_human_prefix<F>(
121 |         &self,
122 |         format: Format,
123 |         f: &mut F,
124 |     ) -> std::fmt::Result
125 |     where
126 |         F: std::fmt::Write,
127 |     {
128 |         format.format_human_prefix(f, self)
129 |     }
130 | 
131 |     /// Get the human's prefix. This a convenience method that creates a new
132 |     /// string and formats it with [`format_human_prefix`].
133 |     ///
134 |     /// [`format_human_prefix`]: Self::format_human_prefix
135 |     pub fn human_prefix(&self, format: Format) -> String {
136 |         let mut s = String::new();
137 |         self.format_human_prefix(format, &mut s).unwrap();
138 |         s
139 |     }
140 | }
141 | 
142 | impl Display for Prompt {
143 |     // By default we format for foundation/unknown models.
144 |     fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
145 |         Format::Unknown.format_prompt(self, None, f)
146 |     }
147 | }
148 | 
/// A message in a chat transcript.
///
/// A [`Prompt`]'s `transcript` field is a `Vec` of these.
#[derive(Debug, Clone)]
#[cfg_attr(test, derive(PartialEq))]
#[cfg_attr(all(test, feature = "webchat"), derive(rocket::UriDisplayQuery))]
#[cfg_attr(
    feature = "serde",
    derive(rocket::serde::Deserialize, rocket::serde::Serialize)
)]
#[cfg_attr(feature = "webchat", derive(rocket::form::FromForm))]
#[cfg_attr(feature = "serde", serde(crate = "rocket::serde"))]
pub struct Message {
    /// Role of the participant this message belongs to.
    pub role: Role,
    /// Text of the message. With the `webchat` feature enabled, form input
    /// is validated with `len(..4096)`.
    #[cfg_attr(feature = "webchat", field(validate = len(..4096)))]
    pub text: String,
}
164 | 
/// A [`Role`] is the participant's role in a chat transcript. This is similar
/// to the OpenAI API's role.
///
/// With the `serde` feature enabled, variant names are (de)serialized in
/// `snake_case`, e.g. [`Role::Human`] as `"human"`.
#[derive(Debug, Clone)]
#[cfg_attr(test, derive(PartialEq))]
#[cfg_attr(all(test, feature = "webchat"), derive(rocket::UriDisplayQuery))]
#[cfg_attr(
    feature = "serde",
    derive(rocket::serde::Deserialize, rocket::serde::Serialize)
)]
#[cfg_attr(feature = "webchat", derive(rocket::form::FromFormField))]
#[cfg_attr(
    feature = "serde",
    serde(crate = "rocket::serde"),
    serde(rename_all = "snake_case")
)]
pub enum Role {
    /// The human participant. Presumably displayed under the name stored in
    /// `Prompt::human` -- confirm against the formatting code.
    Human,
    /// The agent participant. Presumably displayed under the name stored in
    /// `Prompt::agent` -- confirm against the formatting code.
    Agent,
    /// Superuser role. This is some authority figure that constrains the
    /// Agent's behavior. It may be a system, a narrator, or a god.
    System,
}
187 | 


--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
1 | #[cfg(test)]
2 | #[macro_use]
3 | pub mod test;
4 | 
/// Marks a branch as unlikely.
///
/// The body is empty, so calling this does nothing by itself. The `#[cold]`
/// attribute carries the hint: a call placed inside a branch is intended to
/// tell the optimizer that the branch is rarely taken.
#[cold]
#[inline]
pub(crate) fn cold() {}
9 | 


--------------------------------------------------------------------------------
/src/utils/test.rs:
--------------------------------------------------------------------------------
/// Assert that two floating point values are approximately equal, i.e. that
/// `|$a - $b| < $eps`.
///
/// Every operand is evaluated exactly once, so expressions with side effects
/// are safe to pass. Both values are logged with [`dbg!`] on every invocation
/// and are repeated, together with the difference and the tolerance, in the
/// panic message when the assertion fails. A trailing comma is accepted.
macro_rules! assert_approx_eq {
    ($a:expr, $b:expr, $eps:expr $(,)?) => {{
        // log the values for debugging; `dbg!` hands the values back as a
        // tuple, which lets us bind them instead of evaluating them again.
        let (lhs, rhs) = dbg!($a, $b);
        let eps = $eps;
        let diff = (lhs - rhs).abs();
        assert!(
            diff < eps,
            "assertion failed: `|left - right| < eps` \
             (left: `{:?}`, right: `{:?}`, diff: `{:?}`, eps: `{:?}`)",
            lhs,
            rhs,
            diff,
            eps
        );
    }};
}
8 | 


--------------------------------------------------------------------------------
/tests/data/README.md:
--------------------------------------------------------------------------------
1 | # Test Data
2 | 
3 | Lyrics are copyright Genius and Musixmatch. They are intended to test the
4 | `regurgitater` tool. They may not be used for commercial purposes.
5 | 
6 | New York Times articles are copyright New York Times, obviously. OT-III is
7 | copyright COS. Chapters from the Hobbit are copyright the Tolkien estate.
8 | 


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/5_on_it.txt:
--------------------------------------------------------------------------------
 1 | Creep on in
 2 | Ayy, see I'm ridin' high, whoa
 3 | Kinda broke this evening, y'all
 4 | So all I got's five, I got five
 5 | Player, give me some brew and I might just chill
 6 | But I'm the type that like to light another joint, like Cypress Hill
 7 | I steal doobies, spit loogies when I puff on it
 8 | I got some bucks on it, but it ain't enough on it
 9 | Go get the S, the T-I-D-E-S
10 | Nevertheless, I'm hella fresh, rollin' joints like a cigarette
11 | So pass it 'cross the table like ping pong
12 | I'm gone, beatin' my chest like King Kong
13 | It's on, wrap my lips around the .40
14 | And when it comes to fetting another stogie
15 | Fools all kick in like Shinobi
16 | No, he ain't my homie to begin with
17 | It's too many heads to be poppin' to let my friend hit it bit
18 | Unless you pull out the fat, crispy
19 | Five dollar bill, on the real, before it's history
20 | 'Cause fools be havin' them vacuum lungs
21 | And if you let 'em hit it for free, you hella dumb-da-dumb-dumb
22 | I come to school with the Taylor on my earlobe
23 | Avoiding all the dick teasers, skeezers, and weirdos
24 | That be blowing off the land, like, "Where the bomb at?"
25 | Give me two bucks, you take a puff, and pass my bomb back
26 | Suck up that dank like a Slurpee, the serious
27 | Bomb will make a niggy go delirious, like Eddie Murphy
28 | I got more growing pains than Maggie
29 | 'Cause homies nag me to take the dank out of the baggie
30 | I got five on it (got it, good), grab your four, let's get keyed
31 | I got five on it, messin' with that Indo weed
32 | I got five on it (got it, good), it's got me stuck, and I'm tore back
33 | I got five on it, partner, let's go half on a sack
34 | I take sacks to the face
35 | Whenever I can, don't need no crutch
36 | I'm so keyed up 'til the joint be burning my hand
37 | Next time I roll it in a hampa to burn slow
38 | So the ashes won't be burning up my hand, bruh
39 | Hoochies can hit, but they know they got to pitch in
40 | Then I roll a joint that's longer than your extension
41 | 'Cause I'll be damned if you get high off me for free
42 | Hell no, you better bring your own spliff, chief
43 | What's up? Don't babysit that, better pass the joint
44 | Stop hitting, 'cause you know you got asthma
45 | Crack the 40 open, homie, and guzzle it
46 | 'Cause I know the weed in my system is gettin' lonely
47 | I gotta take a whiz test to my P-O
48 | I know I failed, 'cause I done smoked major weed, bro
49 | And every time we with Chris, that fool rollin' up a fatty
50 | But the Tanqueray straight had me
51 | I got five on it (got it, good), grab your four, let's get keyed
52 | I got five on it, messin' with that Indo weed
53 | I got five on it (got it, good), it's got me stuck, and I'm tore back
54 | I got five on it (got it, good), partner, let's go half on a sack
55 | Ayy, make this right, mane, stop at the light, mane
56 | My yester-night thing got me hung off the night train
57 | You fade, I fade, so let's head to the East
58 | Hit the stroll to nine-oh, so we can roll big hashish
59 | I wish I could fade the eighth, but I'm low budget
60 | Still rollin' a two-door cutlass, same old bucket
61 | Foggy windows, soggy Indo
62 | I'm in the 'land getting smoked with my kinfolk
63 | I been smoked
64 | Y'all get spray ya, lay you down up in the O-A-K the Town
65 | Homies don't play around, we down to blaze a pound
66 | Then ease up, speed up through the E-S-O
67 | Drink the V-S-O-P up, with a lemon squeeze up
68 | And everybody's rolled up, I'm the roller
69 | That's quick to fold a blunt out of a bunch of sticky doja (woo-wee)
70 | Hold up, suck up my weed is all you do
71 | Kick in feed, 'cause where I bes we needs half, like Umfufu
72 | I got five on it (got it, good), grab your four, let's get keyed
73 | I got five on it, messin' with that Indo weed
74 | I got five on it (got it, good), it's got me stuck, and I'm tore back
75 | I got five on it (got it, good), partner, let's go half on a sack


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/README.md:
--------------------------------------------------------------------------------
1 | # Lyrics
2 | 
3 | The data within is copyright the original artists. It is included to test the
4 | `detect-infringement` binary. This data is for non-commercial, academic use
5 | only, in order to demonstrate copyright infringement within language models.
6 | 
7 | Data is sourced from:
8 | - Bing.com which sources from Musixmatch
9 | - Genius.com


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/a_day_in_the_life.txt:
--------------------------------------------------------------------------------
 1 | I read the news today oh boy
 2 | About a lucky man who made the grade
 3 | And though the news was rather sad
 4 | Well I just had to laugh
 5 | I saw the photograph
 6 | He blew his mind out in a car
 7 | He didn't notice that the lights had changed
 8 | A crowd of people stood and stared
 9 | They'd seen his face before
10 | Nobody was really sure
11 | If he was from the House of Lords
12 | I saw a film today oh boy
13 | The English Army had just won the war
14 | A crowd of people turned away
15 | But I just had to look
16 | Having read the book
17 | I'd love to turn you on
18 | Woke up, fell out of bed
19 | Dragged a comb across my head
20 | Found my way downstairs and drank a cup
21 | And looking up I noticed I was late
22 | Found my coat and grabbed my hat
23 | Made the bus in seconds flat
24 | Found my way upstairs and had a smoke
25 | And somebody spoke and I went into a dream
26 | I read the news today oh boy
27 | Four thousand holes in Blackburn, Lancashire
28 | And though the holes were rather small
29 | They had to count them all
30 | Now they know how many holes it takes to fill the Albert Hall
31 | I'd love to turn you on


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/a_whole_new_world.txt:
--------------------------------------------------------------------------------
 1 | I can show you the world
 2 | Shining, shimmering, splendid
 3 | Tell me, princess, now when did
 4 | You last let your heart decide?
 5 | I can open your eyes
 6 | Take you wonder by wonder
 7 | Over, sideways and under
 8 | On a magic carpet ride
 9 | A whole new world
10 | A new fantastic point of view
11 | No one to tell us no
12 | Or where to go
13 | Or say we're only dreaming
14 | A whole new world
15 | A dazzling place I never knew
16 | But when I'm way up here, it's crystal clear
17 | That now I'm in a whole new world with you
18 | (Now I'm in a whole new world with you)
19 | Unbelievable sights
20 | Indescribable feeling
21 | Soaring, tumbling, freewheeling
22 | Through an endless diamond sky
23 | A whole new world (don't you dare close your eyes)
24 | A hundred thousand things to see (hold your breath, it gets better)
25 | I'm like a shooting star, I've come so far
26 | I can't go back to where I used to be
27 | A whole new world (every turn a surprise)
28 | With new horizons to pursue (every moment, red-letter)
29 | I'll chase them anywhere, there's time to spare
30 | Let me share this whole new world with you
31 | A whole new world (a whole new world)
32 | That's where we'll be (that's where we'll be)
33 | A thrilling chase (a wondrous place)
34 | For you and me


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/aenema.txt:
--------------------------------------------------------------------------------
 1 | Some say the end is near
 2 | Some say we'll see Armageddon soon
 3 | I certainly hope we will
 4 | I sure could use a vacation from this
 5 | Bullshit three-ring
 6 | Circus sideshow of
 7 | Freaks
 8 | Here in this hopeless fucking hole we call L.A.
 9 | The only way to fix it is to flush it all away
10 | Any fucking time, any fucking day
11 | Learn to swim, I'll see you down in Arizona bay
12 | Fret for your figure and
13 | Fret for your latte and
14 | Fret for your lawsuit and
15 | Fret for your hairpiece and
16 | Fret for your Prozac and
17 | Fret for your pilot and
18 | Fret for your contract and
19 | Fret for your car
20 | It's a
21 | Bullshit three-ring
22 | Circus sideshow of
23 | Freaks
24 | Here in this hopeless fucking hole we call L.A.
25 | The only way to fix it is to flush it all away
26 | Any fucking time, any fucking day
27 | Learn to swim, I'll see you down in Arizona bay
28 | Some say a comet will fall from the sky
29 | Followed by meteor showers and tidal waves
30 | Followed by fault lines that cannot sit still
31 | Followed by millions of dumbfounded dipshits
32 | And some say the end is near
33 | Some say we'll see Armageddon soon
34 | I certainly hope we will
35 | I sure could use a vacation from this
36 | Stupid shit, silly shit, stupid shit
37 | One great big festering neon distraction
38 | I've a suggestion to keep you all occupied
39 | Learn to swim, learn to swim, learn to swim
40 | 'Cause Mom's gonna fix it all soon
41 | Mom's comin' 'round to put it back the way it ought to be
42 | Learn to swim, learn to swim
43 | Learn to swim, learn to swim
44 | Learn to swim, learn to swim
45 | Learn to swim, learn to swim
46 | Fuck L. Ron Hubbard and
47 | Fuck all his clones
48 | Fuck all these gun-toting
49 | Hip gangster wannabes
50 | Learn to swim, learn to swim
51 | Learn to swim, learn to swim
52 | Learn to swim, learn to swim
53 | Learn to swim, learn to swim
54 | Fuck retro anything
55 | Fuck your tattoos
56 | Fuck all you junkies and
57 | Fuck your short memories
58 | Learn to swim, learn to swim
59 | Learn to swim, learn to swim
60 | Learn to swim, learn to swim
61 | Learn to swim, learn to swim
62 | Yeah, fuck smiley glad-hands
63 | With hidden agendas
64 | Fuck these dysfunctional
65 | Insecure actresses
66 | Learn to swim, learn to swim
67 | Learn to swim, learn to swim
68 | Learn to swim, learn to swim
69 | Learn to swim, learn to swim
70 | 'Cause I'm praying for rain
71 | I'm praying for tidal waves
72 | I wanna see the ground give way
73 | I wanna watch it all go down
74 | Mom, please flush it all away
75 | I wanna see it go right in and down
76 | I wanna watch it go right in
77 | Watch you flush it all away
78 | Yeah, time to bring it down again
79 | Yeah, don't just call me pessimist
80 | Try and read between the lines
81 | I can't imagine why you wouldn't
82 | Welcome any change, my friend
83 | I wanna see it come down
84 | Put it down
85 | Suck it down
86 | Flush it down


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/bad_romance.txt:
--------------------------------------------------------------------------------
 1 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
 2 | Caught in a bad romance
 3 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
 4 | Caught in a bad romance
 5 | Rah, rah-ah-ah-ah
 6 | Roma, roma-ma
 7 | Gaga, ooh-la-la
 8 | Want your bad romance
 9 | Rah, rah-ah-ah-ah
10 | Roma, roma-ma
11 | Gaga, ooh-la-la
12 | Want your bad romance
13 | I want your ugly, I want your disease
14 | I want your everything as long as it's free
15 | I want your love
16 | Love, love, love, I want your love (hey)
17 | I want your drama, the touch of your hand (hey)
18 | I want your leather-studded kiss in the sand
19 | I want your love
20 | Love, love, love, I want your love (love, love, love)
21 | (I want your love)
22 | You know that I want you
23 | And you know that I need you
24 | I want it bad, your bad romance
25 | I want your love, and I want your revenge
26 | You and me could write a bad romance (oh-oh-oh-oh-oh)
27 | I want your love and all your lover's revenge
28 | You and me could write a bad romance
29 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
30 | Caught in a bad romance
31 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
32 | Caught in a bad romance
33 | Rah, rah-ah-ah-ah
34 | Roma, roma-ma
35 | Gaga, ooh-la-la
36 | Want your bad romance
37 | I want your horror, I want your design
38 | 'Cause you're a criminal as long as you're mine
39 | I want your love
40 | Love, love, love, I want your love
41 | I want your psycho, your vertigo shtick (hey)
42 | Want you in my rear window, baby, you're sick
43 | I want your love
44 | Love, love, love, I want your love (love, love, love)
45 | (I want your love)
46 | You know that I want you
47 | And you know that I need you ('cause I'm a free bitch, baby)
48 | I want it bad, your bad romance
49 | I want your love, and I want your revenge
50 | You and me could write a bad romance (oh-oh-oh-oh-oh)
51 | I want your love and all your lover's revenge
52 | You and me could write a bad romance
53 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
54 | Caught in a bad romance
55 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
56 | Caught in a bad romance
57 | Rah, rah-ah-ah-ah
58 | Roma, roma-ma
59 | Gaga, ooh-la-la
60 | Want your bad romance
61 | Rah, rah-ah-ah-ah
62 | Roma, roma-ma
63 | Gaga, ooh-la-la
64 | Want your bad romance
65 | Walk, walk, fashion baby
66 | Work it, move that bitch crazy
67 | Walk, walk, fashion baby
68 | Work it, move that bitch crazy
69 | Walk, walk, fashion baby
70 | Work it, move that bitch crazy
71 | Walk, walk, passion baby
72 | Work it, I'm a free bitch, baby
73 | I want your love, and I want your revenge
74 | I want your love, I don't wanna be friends
75 | J'veux ton amour, et je veux ta revanche
76 | J'veux ton amour, I don't wanna be friends (oh-oh-oh-oh-oh, oh-oh-oh-oh)
77 | No, I don't wanna be friends (oh-oh-oh, caught in a bad romance)
78 | I don't wanna be friends (oh-oh-oh-oh-oh, oh-oh-oh-oh)
79 | Want your bad romance (oh-oh-oh)
80 | Caught in a bad romance
81 | Want your bad romance
82 | I want your love, and I want your revenge
83 | You and me could write a bad romance (oh-oh-oh-oh-oh)
84 | I want your love and all your lover's revenge
85 | You and me could write a bad romance
86 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
87 | (Want your bad romance)
88 | Caught in a bad romance
89 | (Want your bad romance)
90 | Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh
91 | (Want your bad romance)
92 | Caught in a bad romance


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/barbie_girl.txt:
--------------------------------------------------------------------------------
 1 | Hiya, Barbie!
 2 | Hi, Ken!
 3 | You wanna go for a ride?
 4 | Sure, Ken!
 5 | Jump in!
 6 | I'm a Barbie girl in a Barbie world
 7 | Life in plastic, it's fantastic
 8 | You can brush my hair, undress me everywhere
 9 | Imagination, life is your creation
10 | Come on Barbie, let's go party
11 | I'm a Barbie girl in a Barbie world
12 | Life in plastic, it's fantastic
13 | You can brush my hair, undress me everywhere
14 | Imagination, life is your creation
15 | I'm a blonde bimbo girl in a fantasy world
16 | Dress me up, make it tight, I'm your dolly
17 | You're my doll, rock and roll, feel the glamour in pink
18 | Kiss me here, touch me there, hanky-panky
19 | You can touch, you can play
20 | If you say, "I'm always yours"
21 | Ooh-whoa-ooh
22 | I'm a Barbie girl in a Barbie world
23 | Life in plastic, it's fantastic
24 | You can brush my hair, undress me everywhere
25 | Imagination, life is your creation
26 | Come on Barbie, let's go party
27 | Ah-ah-ah, yeah
28 | Come on Barbie, let's go party
29 | Ooh-whoa-ooh, ooh-whoa-ooh
30 | Come on Barbie, let's go party
31 | Ah-ah-ah, yeah
32 | Come on Barbie, let's go party
33 | Ooh-whoa-ooh, ooh-whoa-ooh
34 | Make me walk, make me talk, do whatever you please
35 | I can act like a star, I can beg on my knees
36 | Come jump in, bimbo friend, let us do it again
37 | Hit the town, fool around, let's go party
38 | You can touch, you can play
39 | If you say, "I'm always yours"
40 | You can touch, you can play
41 | If you say, "I'm always yours"
42 | Come on Barbie, let's go party
43 | Ah-ah-ah, yeah
44 | Come on Barbie, let's go party
45 | Ooh-whoa-ooh, ooh-whoa-ooh
46 | Come on Barbie, let's go party
47 | Ah-ah-ah, yeah
48 | Come on Barbie, let's go party
49 | Ooh-whoa-ooh, ooh-whoa-ooh
50 | I'm a Barbie girl in a Barbie world
51 | Life in plastic, it's fantastic
52 | You can brush my hair, undress me everywhere
53 | Imagination, life is your creation
54 | I'm a Barbie girl in a Barbie world
55 | Life in plastic, it's fantastic
56 | You can brush my hair, undress me everywhere
57 | Imagination, life is your creation
58 | Come on Barbie, let's go party
59 | Ah-ah-ah, yeah
60 | Come on Barbie, let's go party
61 | Ooh-whoa-ooh, ooh-whoa-ooh
62 | Come on Barbie, let's go party
63 | Ah-ah-ah, yeah
64 | Come on Barbie, let's go party
65 | Ooh-whoa-ooh, ooh-whoa-ooh
66 | Oh, I'm having so much fun!
67 | Well Barbie, we're just getting started
68 | Oh, I love you, Ken!


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/bohemian_rhapsody.txt:
--------------------------------------------------------------------------------
 1 | Is this the real life? Is this just fantasy?
 2 | Caught in a landslide, no escape from reality
 3 | Open your eyes, look up to the skies and see
 4 | I'm just a poor boy, I need no sympathy
 5 | Because I'm easy come, easy go, little high, little low
 6 | Any way the wind blows doesn't really matter to me, to me
 7 | Mama, just killed a man
 8 | Put a gun against his head, pulled my trigger, now he's dead
 9 | Mama, life had just begun
10 | But now I've gone and thrown it all away
11 | Mama, ooh, didn't mean to make you cry
12 | If I'm not back again this time tomorrow
13 | Carry on, carry on as if nothing really matters
14 | Too late, my time has come
15 | Sends shivers down my spine, body's aching all the time
16 | Goodbye, everybody, I've got to go
17 | Gotta leave you all behind and face the truth
18 | Mama, ooh (any way the wind blows)
19 | I don't wanna die
20 | I sometimes wish I'd never been born at all
21 | I see a little silhouetto of a man
22 | Scaramouche, Scaramouche, will you do the Fandango?
23 | Thunderbolt and lightning, very, very frightening me
24 | (Galileo) Galileo, (Galileo) Galileo, Galileo Figaro, magnifico
25 | I'm just a poor boy, nobody loves me
26 | He's just a poor boy from a poor family
27 | Spare him his life from this monstrosity
28 | Easy come, easy go, will you let me go?
29 | Bismillah!
30 | No, we will not let you go (let him go)
31 | Bismillah!
32 | We will not let you go (let him go)
33 | Bismillah!
34 | We will not let you go (let me go)
35 | Will not let you go (let me go)
36 | Never, never, never, never let me go
37 | No, no, no, no, no, no, no
38 | Oh, mamma mia, mamma mia
39 | Mamma mia, let me go
40 | Beelzebub has a devil put aside for me, for me, for me
41 | So you think you can stone me and spit in my eye?
42 | So you think you can love me and leave me to die?
43 | Oh, baby, can't do this to me, baby
44 | Just gotta get out, just gotta get right outta here
45 | Ooh
46 | Ooh, yeah, ooh, yeah
47 | Nothing really matters, anyone can see
48 | Nothing really matters
49 | Nothing really matters to me
50 | (Any way the wind blows)


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/born_this_way.txt:
--------------------------------------------------------------------------------
 1 | It doesn't matter if you love him or capital H-I-M
 2 | Just put your paws up
 3 | 'Cause you were born this way, baby
 4 | My mama told me when I was young, "We are all born superstars"
 5 | She rolled my hair and put my lipstick on in the glass of her boudoir
 6 | "There's nothing wrong with loving who you are"
 7 | She said, "'Cause He made you perfect, babe
 8 | So hold your head up, girl, and you'll go far"
 9 | Listen to me when I say
10 | I'm beautiful in my way 'cause God makes no mistakes
11 | I'm on the right track, baby, I was born this way
12 | Don't hide yourself in regret, just love yourself, and you're set
13 | I'm on the right track, baby, I was born this way (born this way)
14 | Ooh, there ain't no other way, baby, I was born this way
15 | Baby, I was born this way (born this way)
16 | Ooh, there ain't no other way, baby, I was born this way
17 | Right track, baby, I was born this way
18 | Don't be a drag, just be a queen
19 | Don't be a drag, just be a queen
20 | Don't be a drag, just be a queen
21 | Don't be (don't be, don't be)
22 | Give yourself prudence and love your friends
23 | Subway kid, rejoice your truth
24 | In the religion of the insecure, I must be myself, respect my youth
25 | A different lover is not a sin, believe capital H-I-M (hey, hey, hey)
26 | I love my life, I love this record, and
27 | Mi amore vole fe, yah (same DNA)
28 | I'm beautiful in my way 'cause God makes no mistakes
29 | I'm on the right track, baby, I was born this way
30 | Don't hide yourself in regret, just love yourself, and you're set
31 | I'm on the right track, baby, I was born this way
32 | Ooh, there ain't no other way, baby, I was born this way
33 | Baby, I was born this way (born this way)
34 | Ooh, there ain't no other way, baby, I was born this way
35 | I'm on the right track, baby, I was born this way
36 | Don't be a drag, just be a queen
37 | Whether you're broke or evergreen
38 | You're Black, white, beige, chola descent
39 | You're Lebanese, you're Orient'
40 | Whether life's disabilities left you outcast, bullied, or teased
41 | Rejoice and love yourself today
42 | 'Cause, baby, you were born this way
43 | No matter gay, straight, or bi', lesbian, transgender life
44 | I'm on the right track, baby, I was born to survive
45 | No matter Black, white or beige, chola, or Orient' made
46 | I'm on the right track, baby, I was born to be brave
47 | I'm beautiful in my way 'cause God makes no mistakes
48 | I'm on the right track, baby, I was born this way
49 | Don't hide yourself in regret, just love yourself, and you're set
50 | I'm on the right track, baby, I was born this way, yeah
51 | Ooh, there ain't no other way, baby, I was born this way
52 | Baby, I was born this way (born this way)
53 | Ooh, there ain't no other way, baby, I was born this way
54 | I'm on the right track, baby, I was born this way
55 | I was born this way, hey
56 | I was born this way, hey
57 | I'm on the right track, baby, I was born this way, hey
58 | I was born this way, hey
59 | I was born this way, hey
60 | I'm on the right track, baby, I was born this way, hey
61 | Same DNA, but born this way
62 | Same DNA, but born this way


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/buckley-hallelujah.txt:
--------------------------------------------------------------------------------
 1 | I heard there was a secret chord
 2 | That David played and it pleased the Lord
 3 | But you don't really care for music, do you?
 4 | Well it goes like this the fourth, the fifth
 5 | The minor fall and the major lift
 6 | The baffled king composing Hallelujah
 7 | Hallelujah
 8 | Hallelujah
 9 | Hallelujah
10 | Hallelujah
11 | Well your faith was strong but you needed proof
12 | You saw her bathing on the roof
13 | Her beauty and the moonlight overthrew you
14 | She tied you to her kitchen chair
15 | She broke your throne and she cut your hair
16 | And from your lips, she drew the Hallelujah
17 | Hallelujah
18 | Hallelujah
19 | Hallelujah
20 | Hallelujah
21 | Baby, I've been here before
22 | I've seen this room and I've walked this floor
23 | You know, I used to live alone before I knew you
24 | And I've seen your flag on the marble arch
25 | And Love is not a victory march
26 | It's a cold and it's a broken Hallelujah
27 | Hallelujah
28 | Hallelujah
29 | Hallelujah
30 | Hallelujah
31 | Well, there was a time when you let me know
32 | What's really going on below
33 | But now you never show that to me, do you?
34 | But remember, when I moved in you
35 | And the holy dove was moving too
36 | And every breath, we drew was Hallelujah
37 | Hallelujah
38 | Hallelujah
39 | Hallelujah
40 | Hallelujah
41 | Maybe there's a God above
42 | But, all I've ever learned from love
43 | Was how to shoot somebody who outdrew you?
44 | And it's not a cry, that you hear at night
45 | It's not somebody, who's seen the light
46 | It's a cold and it's a broken Hallelujah
47 | Hallelujah
48 | Hallelujah
49 | Hallelujah
50 | Hallelujah
51 | Hallelujah
52 | Hallelujah
53 | Hallelujah
54 | Hallelujah
55 | Hallelujah
56 | Hallelujah
57 | Hallelujah
58 | Hallelujah


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/can_you_feel_the_love_tonight.txt:
--------------------------------------------------------------------------------
 1 | There's a calm surrender
 2 | To the rush of day
 3 | When the heat of a rolling wind
 4 | Can be turned away
 5 | An enchanted moment
 6 | And it sees me through
 7 | It's enough for this restless warrior
 8 | Just to be with you
 9 | And can you feel the love tonight? (Tonight)
10 | It is where we are
11 | It's enough for this wide-eyed wanderer
12 | That we got this far
13 | And can you feel the love tonight? (Tonight)
14 | How it's laid to rest
15 | It's enough to make kings and vagabonds
16 | Believe the very best
17 | There's a time for everyone
18 | If they only learn
19 | That the twisting kaleidoscope
20 | Moves us all in turn
21 | There's a rhyme and reason
22 | To the wild outdoors
23 | When the heart of this star-crossed voyager
24 | Beats in time with yours
25 | And can you feel the love tonight? (Tonight)
26 | It is where we are
27 | It's enough for this wide-eyed wanderer
28 | That we got this far
29 | And can you feel the love tonight? (Tonight)
30 | How it's laid to rest
31 | It's enough to make kings and vagabonds
32 | Believe the very best
33 | It's enough to make kings and vagabonds
34 | Believe the very best


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/candle_in_the_wind.txt:
--------------------------------------------------------------------------------
 1 | Goodbye Norma Jeane
 2 | Though I never knew you at all
 3 | You had the grace to hold yourself
 4 | While those around you crawled
 5 | They crawled out of the woodwork
 6 | And they whispered into your brain
 7 | They set you on the treadmill
 8 | And they made you change your name
 9 | And it seems to me you lived your life
10 | Like a candle in the wind
11 | Never knowing who to cling to
12 | When the rain set in
13 | And I would've liked to known you
14 | But I was just a kid
15 | Your candle burned out long before
16 | Your legend ever did
17 | Loneliness was tough
18 | The toughest role you ever played
19 | Hollywood created a superstar
20 | And pain was the price you paid
21 | Even when you died
22 | Oh the press still hounded you
23 | All the papers had to say
24 | Was that Marilyn was found in the nude
25 | And it seems to me you lived your life
26 | Like a candle in the wind
27 | Never knowing who to cling to
28 | When the rain set in
29 | And I would've liked to known you
30 | But I was just a kid
31 | Your candle burned out long before
32 | Your legend ever did
33 | Goodbye Norma Jeane
34 | Though I never knew you at all
35 | You had the grace to hold yourself
36 | While those around you crawled
37 | Goodbye Norma Jeane
38 | From the young man in the twenty second row
39 | Who sees you as something as more than sexual
40 | More than just our Marilyn Monroe
41 | And it seems to me you lived your life
42 | Like a candle in the wind
43 | Never knowing who to cling to
44 | When the rain set in
45 | And I would've liked to known you
46 | But I was just a kid
47 | Your candle burned out long before
48 | Your legend ever did
49 | Your candle burned out long before
50 | Your legend ever did


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/closer.txt:
--------------------------------------------------------------------------------
 1 | You let me violate you
 2 | You let me desecrate you
 3 | You let me penetrate you
 4 | You let me complicate you
 5 | (Help me) I broke apart my insides
 6 | (Help me) I've got no soul to sell
 7 | (Help me) the only thing that works for me
 8 | Help me get away from myself
 9 | I wanna fuck you like an animal
10 | I wanna feel you from the inside
11 | I wanna fuck you like an animal
12 | My whole existence is flawed
13 | You get me closer to God
14 | You can have my isolation
15 | You can have the hate that it brings
16 | You can have my absence of faith
17 | You can have my everything
18 | (Help me) tear down my reason
19 | (Help me) it's your sex I can smell
20 | (Help me) you make me perfect
21 | Help me become somebody else
22 | I wanna fuck you like an animal
23 | I wanna feel you from the inside
24 | I wanna fuck you like an animal
25 | My whole existence is flawed
26 | You get me closer to God
27 | Through every forest
28 | Above the trees
29 | Within my stomach
30 | Scraped off my knees
31 | I drink the honey
32 | Inside your hive
33 | You are the reason
34 | I stay alive


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/cohen-hallelujah.txt:
--------------------------------------------------------------------------------
 1 | Now I've heard there was a secret chord
 2 | That David played, and it pleased the Lord
 3 | But you don't really care for music, do you?
 4 | It goes like this, the fourth, the fifth
 5 | The minor falls, the major lifts
 6 | The baffled king composing Hallelujah
 7 | Hallelujah, Hallelujah
 8 | Hallelujah, Hallelujah
 9 | Your faith was strong but you needed proof
10 | You saw her bathing on the roof
11 | Her beauty and the moonlight overthrew you
12 | She tied you to a kitchen chair
13 | She broke your throne, and she cut your hair
14 | And from your lips she drew the Hallelujah
15 | Hallelujah, Hallelujah
16 | Hallelujah, Hallelujah
17 | You say I took the name in vain
18 | I don't even know the name
19 | But if I did, well, really, what's it to you?
20 | There's a blaze of light in every word
21 | It doesn't matter which you heard
22 | The holy or the broken Hallelujah
23 | Hallelujah, Hallelujah
24 | Hallelujah, Hallelujah
25 | I did my best, it wasn't much
26 | I couldn't feel, so I tried to touch
27 | I've told the truth, I didn't come to fool you
28 | And even though it all went wrong
29 | I'll stand before the Lord of Song
30 | With nothing on my tongue but Hallelujah
31 | Hallelujah, Hallelujah
32 | Hallelujah, Hallelujah
33 | Hallelujah, Hallelujah
34 | Hallelujah, Hallelujah
35 | Hallelujah, Hallelujah
36 | Hallelujah, Hallelujah
37 | Hallelujah, Hallelujah
38 | Hallelujah, Hallelujah


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/eleanor.txt:
--------------------------------------------------------------------------------
 1 | Ah, look at all the lonely people
 2 | Ah, look at all the lonely people
 3 | Eleanor Rigby
 4 | Picks up the rice in the church where a wedding has been
 5 | Lives in a dream
 6 | Waits at the window
 7 | Wearing the face that she keeps in a jar by the door
 8 | Who is it for?
 9 | All the lonely people
10 | Where do they all come from?
11 | All the lonely people
12 | Where do they all belong?
13 | Father McKenzie
14 | Writing the words of a sermon that no one will hear
15 | No one comes near
16 | Look at him working
17 | Darning his socks in the night when there's nobody there
18 | What does he care?
19 | All the lonely people
20 | Where do they all come from?
21 | All the lonely people
22 | Where do they all belong?
23 | Ah, look at all the lonely people
24 | Ah, look at all the lonely people
25 | Eleanor Rigby
26 | Died in the church and was buried along with her name
27 | Nobody came
28 | Father McKenzie
29 | Wiping the dirt from his hands as he walks from the grave
30 | No one was saved
31 | All the lonely people (ah, look at all the lonely people)
32 | Where do they all come from?
33 | All the lonely people (ah, look at all the lonely people)
34 | Where do they all belong?


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/father_lucifer.txt:
--------------------------------------------------------------------------------
 1 | Father Lucifer you never looked so sane
 2 | You always did prefer the drizzle to the rain
 3 | Tell me that you're still in love with that milkmaid
 4 | How the Lizzies, how's your Jesus Christ been hanging?
 5 | Nothing's gonna stop me from floating
 6 | Nothing's gonna stop me from floating
 7 | He says he reckons I'm a watercolor stain
 8 | He says I run and then I run from him and then I run
 9 | He didn't see me watching from the aeroplane
10 | He wiped a tear and then he threw away our apple seed
11 | Nothing's gonna stop me from floating
12 | Nothing's gonna stop me from floating
13 | Every day's my wedding day
14 | (Go away world only glass)
15 | The baby's still in his comatose state
16 | (Georgie, they're your favourite)
17 | I'll dye my own Easter eggs
18 | (Skiddly-dee I'm in G, yes)
19 | Just don't go yet, just don't go
20 | (Never go, go so fast)
21 | And Beenie lost the sunset but that's okay
22 | (Go away world only glass)
23 | (Maybe she's hiding in a hot dog)
24 | Does Joe bring flowers to Marilyn's grave?
25 | (Georgie, I swear they're your favourite)
26 | (Got a pig hiding in a truffle)
27 | And girls that eat pizza and never gain weight
28 | (There she goes, there she goes)
29 | (Wearing those purple garters)
30 | Never gain weight, never gain weight
31 | (There she goes home)
32 | (And girl I got a condo in Hoboken)
33 | Father Lucifer you never looked so sane
34 | You always did prefer the drizzle to the rain
35 | Tell me that you're still in love with that milkmaid
36 | How the Lizzies, how's your Jesus Christ been hanging?


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/fire_water_burn.txt:
--------------------------------------------------------------------------------
 1 | The roof, the roof, the roof is on fire
 2 | The roof, the roof, the roof is on fire
 3 | The roof, the roof, the roof is on fire
 4 | We don't need no water, let the motherfucker burn
 5 | Burn motherfucker, burn
 6 | Hello, my name is Jimmy Pop and I'm a dumb white guy
 7 | I'm not old or new, but middle school, fifth grade, like junior high
 8 | I don't know mofo, if y'all peeps be buggin' givin' props to my hoe 'cause she fly
 9 | But I can take the heat 'cause I'm the other white meat known as "Kid funky fried"
10 | Yeah, I'm hung like planet Pluto, hard to see with the naked eye
11 | But if I crashed into Uranus, I would stick it where the sun don't shine
12 | 'Cause I'm kind of like Hans Solo, always stroking my own wookie
13 | I'm the root of all that's evil, yea, but you can call me Cookie
14 | The roof, the roof, the roof is on fire
15 | The roof, the roof, the roof is on fire
16 | The roof, the roof, the roof is on fire
17 | We don't need no water, let the motherfucker burn
18 | Burn motherfucker, burn
19 | Yo, yo
20 | This hardcore ghetto gangster image takes a lot of practice
21 | I'm not black like Barry White, no, I am white like Frank Black is
22 | So, if man is five and the devil is six, then that must make me seven
23 | This honkey's gone to heaven
24 | But if I go to hell, well, then I hope I burn well
25 | I'll spend my days with J.F.K., Marvin Gaye, Martha Raye and Lawrence Welk
26 | And Kurt Cobain, Kojak, Mark Twain and Jimi Hendrix's poltergeist
27 | And Webster, yeah, Emmanuel Lewis 'cause he's the Antichrist
28 | The roof, the roof, the roof is on fire
29 | The roof, the roof, the roof is on fire
30 | The roof, the roof, the roof is on fire
31 | We don't need no water, let the motherfucker burn
32 | Burn motherfucker, burn
33 | Everybody, here we go
34 | (Ooh, ooh)
35 | C'mon party people
36 | (Ooh, ooh)
37 | Throw your hands in the air
38 | (Ooh, ooh)
39 | C'mon party people
40 | (Ooh, ooh)
41 | Wave 'em like you don't care
42 | (Ooh, ooh)
43 | C'mon party people
44 | (Ooh, ooh)
45 | Everybody say, "Ho"
46 | (Ooh, ooh)
47 | C'mon party people
48 | (Ooo, ooo)
49 | Everybody here we go
50 | (Ooh, ooh)
51 | C'mon party people
52 | (Ooh, ooh)
53 | Throw your hands in the air
54 | (Ooh, ooh)
55 | C'mon party people
56 | (Ooo, ooo)
57 | Wave 'em like you don't care
58 | (Ooh, ooh)
59 | C'mon party people
60 | (Ooh, ooh)
61 | Everybody say, "Ho"
62 | (Ooh, ooh)
63 | C'mon party people
64 | (Ooh, ooh)
65 | Everybody here we go
66 | (Ooh, ooh)
67 | C'mon party people
68 | (Ooh, ooh)
69 | Throw your hands in the air
70 | (Ooh, ooh)
71 | C'mon party people
72 | (Ooh, ooh)
73 | Wave 'em like you don't care
74 | (Ooh, ooh)
75 | C'mon party people
76 | (Ooh, ooh)
77 | Everybody say, "Ho"
78 | (Ooh, ooh)
79 | C'mon party people
80 | (Ooh, ooh)
81 | Everybody here we go
82 | (Ooh, ooh)
83 | C'mon party people
84 | (Ooh, ooh)
85 | Throw your hands in the air
86 | (Ooh, ooh)
87 | C'mon party people
88 | (Ooh, ooh)
89 | Wave 'em like you don't care
90 | (Ooh, ooh)
91 | C'mon party people
92 | (Ooh, ooh)


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/gangstas_paradise.txt:
--------------------------------------------------------------------------------
 1 | As I walk through the valley of the shadow of death
 2 | I take a look at my life and realize there's nothin' left
 3 | 'Cause I've been blastin' and laughin' so long that
 4 | Even my mama thinks that my mind is gone
 5 | But I ain't never crossed a man that didn't deserve it
 6 | Me be treated like a punk, you know that's unheard of
 7 | You better watch how ya talkin' and where ya walkin'
 8 | Or you and your homies might be lined in chalk
 9 | I really hate to trip but I gotta loc
10 | As they croak, I see myself in the pistol smoke, fool
11 | I'm the kind of G that little homies wanna be like
12 | On my knees in the night, sayin' prayers in the streetlight
13 | We've been spendin' most their lives livin' in the gangsta's paradise
14 | We've been spendin' most their lives livin' in the gangsta's paradise
15 | We keep spendin' most our lives livin' in the gangsta's paradise
16 | We keep spendin' most our lives livin' in the gangsta's paradise
17 | Look at the situation they got me facin'
18 | I can't live a normal life, I was raised by the stripes
19 | So I gotta be down with the hood team
20 | Too much television watchin' got me chasin' dreams
21 | I'm a educated fool with money on my mind
22 | Got my ten in my hand and a gleam in my eye
23 | I'm a loc'd out gangsta, set trippin' banger
24 | And my homies is down, so don't arouse my anger, fool
25 | Death ain't nothin' but a heartbeat away
26 | I'm livin' my life do-or-die, uh, what can I say?
27 | I'm 23 now, but will I live to see 24?
28 | The way things is going, I don't know
29 | Tell me why are we so blind to see
30 | That the ones we hurt are you and me?
31 | We've been spendin' most their lives livin' in the gangsta's paradise
32 | We've been spendin' most their lives livin' in the gangsta's paradise
33 | We keep spendin' most our lives livin' in the gangsta's paradise
34 | We keep spendin' most our lives livin' in the gangsta's paradise
35 | Power and the money, money and the power
36 | Minute after minute, hour after hour
37 | Everybody's running, but half of them ain't lookin'
38 | What's going on in the kitchen? But I don't know what's cookin'
39 | They say I gotta learn, but nobody's here to teach me
40 | If they can't understand it, how can they reach me?
41 | I guess they can't, I guess they won't, I guess they front
42 | That's why I know my life is out of luck, fool
43 | We've been spendin' most their lives livin' in the gangsta's paradise
44 | We've been spendin' most their lives livin' in the gangsta's paradise
45 | We keep spendin' most our lives livin' in the gangsta's paradise
46 | We keep spendin' most our lives livin' in the gangsta's paradise
47 | Tell me why are we so blind to see
48 | That the ones we hurt are you and me?
49 | Tell me why are we so blind to see
50 | That the ones we hurt are you and me?


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/graceland.txt:
--------------------------------------------------------------------------------
 1 | The Mississippi Delta was shining like a national guitar
 2 | I am following the river down the highway through the cradle of the civil war
 3 | I'm going to Graceland, Graceland, Memphis, Tennessee
 4 | I'm going to Graceland
 5 | Poor boys and pilgrims with families
 6 | And we are going to Graceland
 7 | My traveling companion is nine years old
 8 | He is the child of my first marriage
 9 | But I've reason to believe we both will be received in Graceland
10 | She comes back to tell me she's gone
11 | As if I didn't know that
12 | As if I didn't know my own bed
13 | As if I'd never noticed the way she brushed her hair from her forehead
14 | And she said, "Losing love is like a window in your heart
15 | Everybody sees you're blown apart
16 | Everybody sees the wind blow"
17 | I'm going to Graceland, Memphis, Tennessee
18 | I'm going to Graceland
19 | Poor boys and pilgrims with families
20 | And we are going to Graceland
21 | And my traveling companions are ghosts and empty sockets
22 | I'm looking at ghosts and empties
23 | But I've reason to believe we all will be received in Graceland
24 | There is a girl in New York City who calls herself the human trampoline
25 | And sometimes when I'm falling, flying or tumbling in turmoil I say
26 | "Whoa, so this is what she means"
27 | She means we're bouncing in the Graceland
28 | And I see losing love is like a window in your heart
29 | Well, everybody sees you're blown apart
30 | Everybody feels the wind blow
31 | Ooh, ooh, ooh
32 | In Graceland, in Graceland
33 | I'm going to Graceland
34 | For reasons, I cannot explain
35 | There's some part of me wants to see Graceland
36 | And I may be obliged to defend every love, every ending
37 | Or maybe there's no obligations now
38 | Maybe I've a reason to believe we all will be received in Graceland
39 | Whoa, oh, oh
40 | In Graceland, in Graceland, in Graceland
41 | I'm going to Graceland


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/hakuna_matata.txt:
--------------------------------------------------------------------------------
  1 | [TIMON]
  2 | Hakuna matata! What a wonderful phrase!
  3 | 
  4 | [PUMBAA]
  5 | Hakuna matata! Ain't no passing craze!
  6 | 
  7 | [TIMON]
  8 | It means no worries, for the rest of your days...
  9 | 
 10 | [TIMON & PUMBAA]
 11 | It's our problem-free philosophy...
 12 | 
 13 | [TIMON]
 14 | Hakuna matata!
 15 | 
 16 | Why, when he was a young warthog...
 17 | 
 18 | [PUMBAA]
 19 | When I was a young warthog...
 20 | 
 21 | [TIMON]
 22 | Very nice
 23 | 
 24 | [PUMBAA]
 25 | Thanks
 26 | 
 35 | [TIMON]
 36 | He found his aroma lacked a certain appeal
 37 | He could clear the savannah after every meal
 38 | 
 39 | [PUMBAA]
 40 | I'm a sensitive soul though I seem thick-skinned
 41 | And it hurt that my friends never stood downwind
 42 | And, oh, the shame!
 43 | 
 44 | [TIMON]
 45 | He was ashamed
 46 | 
 47 | [PUMBAA]
 48 | Thought of changin' my name!
 49 | 
 50 | [TIMON]
 51 | Oh, what's in a name?
 52 | 
 53 | [PUMBAA]
 54 | And I got downhearted
 55 | 
 56 | [TIMON]
 57 | How did ya feel?
 58 | 
 59 | [PUMBAA]
 60 | Everytime that I...
 61 | [TIMON]
 62 | Hey! Pumbaa! Not in front of the kids!
 63 | 
 64 | [PUMBAA]
 65 | Oh. Sorry...
 66 | 
 67 | [TIMON & PUMBAA]
 68 | Hakuna Matata! What a wonderful phrase
 69 | Hakuna Matata! Ain't no passing craze
 70 | 
 71 | [SIMBA]
 72 | It means no worries for the rest of your days
 73 | 
 74 | [TIMON]
 75 | Yeah, sing it, kid!
 76 | 
 77 | [TIMON & SIMBA]
 78 | It's our problem-free
 79 | 
 80 | [PUMBAA]
 81 | Philosophy
 82 | 
 83 | [TIMON & PUMBAA & SIMBA]
 84 | Hakuna Matata!
 85 | 
 86 | [TIMON & PUMBAA & SIMBA]
 87 | Hakuna matata!
 88 | Hakuna matata!
 89 | Hakuna matata!
 90 | Hakuna...
 91 | [SIMBA]
 92 | It means no worries for the rest of your days
 93 | 
 94 | [TIMON & PUMBAA & SIMBA]
 95 | It's our problem-free philosophy
 96 | Hakuna matata!
 97 | Hakuna matata!
 98 | Hakuna matata!
 99 | Hakuna matata!
100 | 
101 | (SIMBA scat sings to fade)


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/hotel_california.txt:
--------------------------------------------------------------------------------
 1 | On a dark desert highway
 2 | Cool wind in my hair
 3 | Warm smell of colitas
 4 | Rising up through the air
 5 | Up ahead in the distance
 6 | I saw a shimmering light
 7 | My head grew heavy and my sight grew dim
 8 | I had to stop for the night
 9 | There she stood in the doorway
10 | I heard the mission bell
11 | And I was thinkin' to myself
12 | "This could be heaven or this could be hell"
13 | Then she lit up a candle
14 | And she showed me the way
15 | There were voices down the corridor
16 | I thought I heard them say
17 | "Welcome to the Hotel California
18 | Such a lovely place (such a lovely place)
19 | Such a lovely face
20 | Plenty of room at the Hotel California
21 | Any time of year (any time of year)
22 | You can find it here"
23 | Her mind is Tiffany-twisted
24 | She got the Mercedes-Benz, uh
25 | She got a lot of pretty, pretty boys
26 | That she calls friends
27 | How they dance in the courtyard
28 | Sweet summer sweat
29 | Some dance to remember
30 | Some dance to forget
31 | So I called up the Captain
32 | "Please bring me my wine"
33 | He said, "We haven't had that spirit here
34 | Since 1969"
35 | And still, those voices are calling
36 | From far away
37 | Wake you up in the middle of the night
38 | Just to hear them say
39 | "Welcome to the Hotel California
40 | Such a lovely place (such a lovely place)
41 | Such a lovely face
42 | They're livin' it up at the Hotel California
43 | What a nice surprise (what a nice surprise)
44 | Bring your alibis"
45 | Mirrors on the ceiling
46 | The pink champagne on ice
47 | And she said, "We are all just prisoners here
48 | Of our own device"
49 | And in the master's chambers
50 | They gathered for the feast
51 | They stab it with their steely knives
52 | But they just can't kill the beast
53 | Last thing I remember
54 | I was running for the door
55 | I had to find the passage back
56 | To the place I was before
57 | "Relax," said the night man
58 | "We are programmed to receive
59 | You can check out any time you like
60 | But you can never leave!"


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/imagine.txt:
--------------------------------------------------------------------------------
 1 | Imagine there's no Heaven
 2 | It's easy if you try
 3 | No Hell below us
 4 | Above us only sky
 5 | Imagine all the people
 6 | Living for today
 7 | Ah, ah, ah-ah
 8 | Imagine there's no countries
 9 | It isn't hard to do
10 | Nothing to kill or die for
11 | And no religion, too
12 | Imagine all the people
13 | Living life in peace
14 | Yoo-hoo, ooh-ooh
15 | You may say I'm a dreamer
16 | But I'm not the only one
17 | I hope someday you'll join us
18 | And the world will be as one
19 | Imagine no possessions
20 | I wonder if you can
21 | No need for greed or hunger
22 | A brotherhood of man
23 | Imagine all the people
24 | Sharing all the world
25 | Yoo-hoo, ooh-ooh
26 | You may say I'm a dreamer
27 | But I'm not the only one
28 | I hope someday you'll join us
29 | And the world will live as one


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/je_ne_regrette_rien.txt:
--------------------------------------------------------------------------------
 1 | Non, rien de rien
 2 | Non, je ne regrette rien
 3 | Ni le bien, qu'on m'a fait
 4 | Ni le mal, tout ça m'est bien égal
 5 | Non, rien de rien
 6 | Non, je ne regrette rien
 7 | C'est payé, balayé, oublié
 8 | Je me fous du passé
 9 | Avec mes souvenirs
10 | J'ai allumé le feu
11 | Mes chagrins, mes plaisirs
12 | Je n'ai plus besoin d'eux
13 | Balayer les amours
14 | Avec leurs trémolos
15 | Balayer pour toujours
16 | Je repars à zéro
17 | Non, rien de rien
18 | Non, je ne regrette rien
19 | Ni le bien, qu'on m'a fait
20 | Ni le mal, tout ça m'est bien égal
21 | Non, rien de rien
22 | Non, je ne regrette rien
23 | Car ma vie, car mes joies
24 | Aujourd'hui, ça commence avec toi


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/knockin_on_heavens_door.txt:
--------------------------------------------------------------------------------
 1 | Mama, take this badge off of me
 2 | I can't use it anymore
 3 | It's gettin' dark, too dark to see
 4 | Feel I'm knockin' on Heaven's door
 5 | Knock, knock, knockin' on Heaven's door
 6 | Knock, knock, knockin' on Heaven's door
 7 | Knock, knock, knockin' on Heaven's door
 8 | Knock, knock, knockin' on Heaven's door
 9 | Mama, put my guns in the ground
10 | I can't shoot them anymore
11 | That long black cloud is comin' down
12 | I feel I'm knockin' on Heaven's door
13 | Knock, knock, knockin' on Heaven's door
14 | Knock, knock, knockin' on Heaven's door
15 | Knock, knock, knockin' on Heaven's door
16 | Knock, knock, knockin' on Heaven's door


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/landslide.txt:
--------------------------------------------------------------------------------
 1 | Fleetwood Mac - Landslide
 2 | 
 3 | I took my love, I took it down
 4 | I climbed a mountain and I turned around
 5 | And I saw my reflection in the snow-covered hills
 6 | 'Til the landslide brought me down
 7 | Oh, mirror in the sky
 8 | What is love?
 9 | Can the child within my heart rise above?
10 | Can I sail through the changin' ocean tides?
11 | Can I handle the seasons of my life?
12 | Well, I've been afraid of changin'
13 | 'Cause I've built my life around you
14 | But time makes you bolder
15 | Even children get older
16 | And I'm getting older too
17 | Well, I've been afraid of changin'
18 | 'Cause I've built my life around you
19 | But time makes you bolder
20 | Even children get older
21 | And I'm getting older too
22 | Oh! I'm getting older too
23 | Oh-oh, take my love, take it down
24 | Oh-oh, climb a mountain and you turn around
25 | And if you see my reflection in the snow-covered hills
26 | Well, the landslide bring it down
27 | And if you see my reflection in the snow-covered hills
28 | Well, the landslide bring it down
29 | Oh-ohh, the landslide bring it down


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/last_cristmas.txt:
--------------------------------------------------------------------------------
 1 | I don't want a lot for Christmas
 2 | There is just one thing I need
 3 | I don't care about the presents underneath the Christmas tree
 4 | I just want you for my own
 5 | More than you could ever know
 6 | Make my wish come true
 7 | All I want for Christmas is you
 8 | Yeah
 9 | I don't want a lot for Christmas
10 | There is just one thing I need (and I)
11 | Don't care about the presents underneath the Christmas tree
12 | I don't need to hang my stocking there upon the fireplace
13 | Santa Claus won't make me happy with a toy on Christmas Day
14 | I just want you for my own
15 | More than you could ever know
16 | Make my wish come true
17 | All I want for Christmas is you
18 | You, baby
19 | Oh, I won't ask for much this Christmas
20 | I won't even wish for snow (and I)
21 | I'm just gonna keep on waiting underneath the mistletoe
22 | I won't make a list and send it to the North Pole for Saint Nick
23 | I won't even stay awake to hear those magic reindeer click
24 | 'Cause I just want you here tonight
25 | Holding on to me so tight
26 | What more can I do?
27 | Oh, baby, all I want for Christmas is you
28 | You, baby
29 | Oh-oh, all the lights are shining so brightly everywhere (so brightly, baby)
30 | And the sound of children's laughter fills the air (oh, oh, yeah)
31 | And everyone is singing (oh, yeah)
32 | I hear those sleigh bells ringing
33 | Santa, won't you bring me the one I really need? (Yeah, oh)
34 | Won't you please bring my baby to me?
35 | Oh, I don't want a lot for Christmas
36 | This is all I'm asking for
37 | I just wanna see my baby standing right outside my door
38 | Oh, I just want you for my own
39 | More than you could ever know
40 | Make my wish come true
41 | Oh, baby, all I want for Christmas is you
42 | You, baby
43 | All I want for Christmas is you, baby
44 | All I want for Christmas is you, baby
45 | All I want for Christmas is you, baby
46 | All I want for Christmas (all I really want) is you, baby
47 | All I want (I want) for Christmas (all I really want) is you, baby


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/life_on_mars.txt:
--------------------------------------------------------------------------------
 1 | It's a god-awful small affair
 2 | To the girl with the mousy hair
 3 | But her mummy is yelling, "No!"
 4 | And her daddy has told her to go
 5 | But her friend is nowhere to be seen
 6 | Now she walks through her sunken dream
 7 | To the seat with the clearest view
 8 | And she's hooked to the silver screen
 9 | But the film is a saddening bore
10 | For she's lived it ten times or more
11 | She could spit in the eyes of fools
12 | As they ask her to focus on
13 | Sailors fighting in the dance hall
14 | Oh man, look at those cavemen go
15 | It's the freakiest show
16 | Take a look at the lawman
17 | Beating up the wrong guy
18 | Oh man, wonder if he'll ever know
19 | He's in the best-selling show
20 | Is there life on Mars?
21 | It's on America's tortured brow
22 | That Mickey Mouse has grown up a cow
23 | Now the workers have struck for fame
24 | 'Cause Lennon's on sale again
25 | See the mice in their million hordes
26 | From Ibiza to the Norfolk Broads
27 | "Rule, Britannia" is out of bounds
28 | To my mother, my dog and clowns
29 | But the film is a saddening bore
30 | 'Cause I wrote it ten times or more
31 | It's about to be writ again
32 | As I ask you to focus on
33 | Sailors fighting in the dance hall
34 | Oh man, look at those cavemen go
35 | It's the freakiest show
36 | Take a look at the lawman
37 | Beating up the wrong guy
38 | Oh man, wonder if he'll ever know
39 | He's in the best-selling show
40 | Is there life on Mars?


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/like_a_prayer.txt:
--------------------------------------------------------------------------------
 1 | Life is a mystery
 2 | Everyone must stand alone
 3 | I hear you call my name
 4 | And it feels like home
 5 | When you call my name it's like a little prayer
 6 | I'm down on my knees, I wanna take you there
 7 | In the midnight hour I can feel your power
 8 | Just like a prayer you know I'll take you there
 9 | I hear your voice
10 | It's like an angel sighing
11 | I have no choice, I hear your voice
12 | Feels like flying
13 | I close my eyes
14 | Oh God I think I'm falling
15 | Out of the sky, I close my eyes
16 | Heaven help me
17 | When you call my name it's like a little prayer
18 | I'm down on my knees, I wanna take you there
19 | In the midnight hour I can feel your power
20 | Just like a prayer you know I'll take you there
21 | Like a child
22 | You whisper softly to me
23 | You're in control just like a child
24 | Now I'm dancing
25 | It's like a dream
26 | No end and no beginning
27 | You're here with me it's like a dream
28 | Let the choir sing
29 | When you call my name it's like a little prayer
30 | I'm down on my knees, I wanna take you there
31 | In the midnight hour I can feel your power
32 | Just like a prayer you know I'll take you there
33 | When you call my name it's like a little prayer
34 | I'm down on my knees, I wanna take you there
35 | In the midnight hour I can feel your power
36 | Just like a prayer you know I'll take you there
37 | Life is a mystery
38 | Everyone must stand alone
39 | I hear you call my name
40 | And it feels like home
41 | Just like a prayer, your voice can take me there
42 | Just like a muse to me, you are a mystery
43 | Just like a dream, you are not what you seem
44 | Just like a prayer, no choice your voice can take me there
45 | Just like a prayer, I'll take you there
46 | It's like a dream to me
47 | Just like a prayer, I'll take you there
48 | It's like a dream to me
49 | Just like a prayer, I'll take you there
50 | It's like a dream to me
51 | Just like a prayer, I'll take you there
52 | It's like a dream to me
53 | Just like a prayer, your voice can take me there
54 | Just like a muse to me, you are a mystery
55 | Just like a dream, you are not what you seem
56 | Just like a prayer, no choice your voice can take me there
57 | Just like a prayer, your voice can take me there
58 | Just like a muse to me, you are a mystery
59 | Just like a dream, you are not what you seem
60 | Just like a prayer, no choice your voice can take me there
61 | Your voice can take me there
62 | Like a prayer
63 | Just like a prayer
64 | Just like a prayer, your voice can take me there
65 | Just like a prayer
66 | Just like a prayer, your voice can take me there


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/like_a_virgin.txt:
--------------------------------------------------------------------------------
 1 | I made it through the wilderness
 2 | Somehow I made it through
 3 | Didn't know how lost I was
 4 | Until I found you
 5 | I was beat
 6 | Incomplete
 7 | I'd been had, I was sad and blue
 8 | But you made me feel
 9 | Yeah, you made me feel
10 | Shiny and new (Hoo)
11 | Like a virgin
12 | Touched for the very first time
13 | Like a virgin
14 | When your heart beats
15 | Next to mine
16 | Gonna give you all my love, boy
17 | My fear is fading fast
18 | Been saving it all for you
19 | 'Cause only love can last
20 | You're so fine
21 | And you're mine
22 | Make me strong, yeah you make me bold
23 | Oh your love thawed out
24 | Yeah, your love thawed out
25 | What was scared and cold
26 | Like a virgin, hey
27 | Touched for the very first time
28 | Like a virgin
29 | With your heartbeat
30 | Next to mine
31 | Whoa
32 | Whoa, ah
33 | Whoa
34 | You're so fine
35 | And you're mine
36 | I'll be yours
37 | 'Til the end of time
38 | 'Cause you made me feel
39 | Yeah, you made me feel
40 | I've nothing to hide
41 | Like a virgin, hey
42 | Touched for the very first time
43 | Like a virgin
44 | With your heartbeat
45 | Next to mine
46 | Like a virgin, oh oh
47 | Like a virgin
48 | Feels so good inside
49 | When you hold me
50 | And your heart beats
51 | And you love me
52 | Oh oh, oh
53 | Oh oh oh


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/loser.txt:
--------------------------------------------------------------------------------
 1 | In the time of chimpanzees I was a monkey
 2 | Butane in my veins and I'm out to cut the junkie
 3 | With the plastic eyeballs, spray-paint the vegetables
 4 | Dog food stalls with the beefcake pantyhose
 5 | Kill the headlights and put it in neutral
 6 | Stock car flamin' with a loser and the cruise control
 7 | Baby's in Reno with the vitamin D
 8 | Got a couple of couches, sleep on the love-seat
 9 | Someone came in sayin' I'm insane to complain
10 | About a shotgun wedding and a stain on my shirt
11 | Don't believe everything that you breathe
12 | You get a parking violation and a maggot on your sleeve
13 | So shave your face with some mace in the dark
14 | Savin' all your food stamps and burnin' down the trailer park
15 | Yo, cut it
16 | Soy un perdedor
17 | I'm a loser baby, so why don't you kill me?
18 | (Double-barrel buckshot)
19 | Soy un perdedor
20 | I'm a loser baby, so why don't you kill me?
21 | Forces of evil in a bozo nightmare
22 | Ban all the music with a phony gas chamber
23 | 'Cause one's got a weasel and the other's got a flag
24 | One's on the pole, shove the other in a bag
25 | With the rerun shows and the cocaine nose-job
26 | The daytime crap of the folksinger slob
27 | He hung himself with a guitar string
28 | A slab of turkey-neck and it's hanging from a pigeon wing
29 | You can't write if you can't relate
30 | Trade the cash for the beef for the body for the hate
31 | And my time is a piece of wax falling on a termite
32 | That's choking on the splinters
33 | Soy un perdedor
34 | I'm a loser baby, so why don't you kill me?
35 | (Get crazy with the cheeze whiz)
36 | Soy un perdedor
37 | I'm a loser baby, so why don't you kill me?
38 | (Drive-by body pierce)
39 | Yo, bring it on down
40 | (I'm a driver, I'm a winner)
41 | (Things are gonna change I can feel it)
42 | Soy un perdedor
43 | I'm a loser baby, so why don't you kill me?
44 | (I can't believe you)
45 | Soy un perdedor
46 | I'm a loser baby, so why don't you kill me?
47 | Soy un perdedor
48 | I'm a loser baby, so why don't you kill me?
49 | (Sprechen Sie deutsch, baby?)
50 | Soy un perdedor
51 | I'm a loser baby, so why don't you kill me?
52 | (Know what I'm sayin'?)


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/lovin_feeling.txt:
--------------------------------------------------------------------------------
 1 | [Verse 1]
 2 | You never close your eyes anymore
 3 | When I kiss your lips
 4 | And there's no tenderness like before
 5 | In your fingertips
 6 | 
 7 | [Pre-Chorus]
 8 | You're trying hard not to show it (Baby)
 9 | But baby, baby, I know it
10 | 
11 | [Chorus]
12 | You've lost that lovin' feelin'
13 | Whoa, that lovin' feelin'
14 | You've lost that lovin' feelin'
15 | Now it's gone, gone, gone, whoa-oh-oh-oh
16 | 
17 | [Verse 2]
18 | Now there's no welcome look in your eyes
19 | When I reach for you
20 | And now you're starting to criticize
21 | Little things I do
22 | 
23 | [Pre-Chorus]
24 | It makes me just feel like crying (Baby)
25 | 'Cause, baby, something beautiful's dying
26 | 
27 | [Chorus]
28 | You've lost that lovin' feelin'
29 | Whoa, that lovin' feelin'
30 | You've lost that lovin' feelin'
31 | Now it's gone, gone, gone, whoa-oh-oh-oh
32 | 
33 | [Bridge]
34 | Baby, baby, I'd get down on my knees for you
35 | If you would only love me like you used to do, yeah
36 | We had a love, a love, a love you don't find every day
37 | So don't, don't, don't, don't let it slip away
38 | Baby (Baby), baby (Baby)
39 | I beg of you, please (Please), please (Please)
40 | I need your love (I need your love)
41 | I need your love (I need your love)
42 | So bring it on back (So bring it on back)
43 | Bring it on back (So bring it on back)
44 | 
45 | [Chorus]
46 | Bring back that lovin' feelin'
47 | Whoa, that lovin' feeling
48 | Bring back that lovin' feelin'
49 | 'Cause it's gone, gone, gone
50 | And I can't go on, whoa-oh-oh
51 | 
52 | [Outro]
53 | Bring back that lovin' feelin'
54 | Whoa, that lovin' feelin'
55 | Bring back that lovin' feelin'
56 | 'Cause it's gone, gone...


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/my_name_is.txt:
--------------------------------------------------------------------------------
  1 | [Produced by Dr. Dre]
  2 | 
  3 | [Chorus: Eminem]
  4 | Hi, my name is, what? My name is, who?
  5 | My name is, chka-chka, Slim Shady
  6 | Hi, my name is, huh? My name is, what?
  7 | My name is, chka-chka, Slim Shady
  8 | Hi, my name is, what? (Excuse me) My name is, who?
  9 | My name is, chka-chka, Slim Shady
 10 | (Can I have the attention of the class for one second?)
 11 | Hi, my name is, huh? My name is, what?
 12 | My name is, chka-chka, Slim Shady
 13 | 
 14 | [Verse 1: Eminem & Dr. Dre]
 15 | Hi, kids, do you like violence? (Yeah, yeah, yeah)
 16 | Wanna see me stick nine-inch nails through each one of my eyelids? (Uh-huh)
 17 | Wanna copy me and do exactly like I did? (Yeah, yeah)
 18 | Try 'cid and get fucked up worse than my life is? (Huh?)
 19 | My brain's dead weight, I'm tryna get my head straight
 20 | But I can't figure out which Spice Girl I want to impregnate (Oh)
 21 | And Dr. Dre said, "Slim Shady, you a basehead" (Uh-uh)
 22 | "Then why's your face red? Man, you wasted"
 23 | Well, since age 12, I felt like I'm someone else
 24 | 'Cause I hung my original self from the top bunk with a belt
 25 | Got pissed off and ripped Pamela Lee's tits off
 26 | And smacked her so hard I knocked her clothes backwards like Kris Kross
 27 | I smoke a fat pound of grass, and fall on my ass
 28 | Faster than a fat bitch who sat down too fast
 29 | Come here, slut; "Shady, wait a minute, that's my girl, dawg"
 30 | I don't give a fuck, God sent me to piss the world off
 31 | See rap shows near Seattle
 32 | Get tickets as low as $34
 33 | You might also like
 34 | The Real Slim Shady
 35 | Eminem
 36 | Big Foot
 37 | Nicki Minaj
 38 | Without Me
 39 | Eminem
 40 | [Chorus: Eminem]
 41 | Hi, my name is, what? My name is, who?
 42 | My name is, chka-chka, Slim Shady
 43 | Hi, my name is, huh? My name is, what?
 44 | My name is, chka-chka, Slim Shady
 45 | Hi, my name is, what? My name is, who?
 46 | My name is, chka-chka, Slim Shady
 47 | Hi, my name is, huh? My name is, what?
 48 | My name is, chka-chka, Slim Shady
 49 | 
 50 | [Verse 2: Eminem]
 51 | My English teacher wanted to flunk me in junior high (Shh)
 52 | Thanks a lot, next semester I'll be 35
 53 | I smacked him in his face with an eraser, chased him with a stapler
 54 | And stapled his nuts to a stack of paper (Ow)
 55 | Walked in the strip club, had my jacket zipped up
 56 | Flashed the bartender, then stuck my dick in the tip cup
 57 | Extraterrestrial, running over pedestrians in a spaceship While they're screaming at me, "Let's just be friends"
 58 | 99 percent of my life, I was lied to
 59 | I just found out my mom does more dope than I do (Damn)
 60 | I told her I'd grow up to be a famous rapper
 61 | Make a record about doin' drugs and name it after her
 62 | (Oh, thank you)
 63 | You know you blew up when the women rush your stands
 64 | And try to touch your hands like some screamin' Usher fans
 65 | (Ahh, ahh, ahh)
 66 | This guy at White Castle asked for my autograph (Dude, can I get your autograph?)
 67 | So I signed it, "Dear Dave, thanks for the support, asshole"
 68 | [Chorus: Eminem]
 69 | Hi, my name is, huh? My name is, who?
 70 | My name is, chka-chka, Slim Shady
 71 | Hi, my name is, what? My name is, who?
 72 | My name is, chka-chka, Slim Shady
 73 | Hi, my name is, huh? My name is, who?
 74 | My name is, chka-chka, Slim Shady
 75 | Hi, my name is, what? My name is, who?
 76 | My name is, chka-chka, Slim Shady
 77 | 
 78 | [Verse 3: Eminem]
 79 | Stop the tape, this kid needs to be locked away (Get him)
 80 | Dr. Dre, don't just stand there, operate
 81 | I'm not ready to leave, it's too scary to die (Fuck that)
 82 | I'll have to be carried inside the cemetery and buried alive
 83 | (Huh, yup)
 84 | Am I comin' or goin'? I can barely decide
 85 | I just drank a fifth of vodka, dare me to drive? (Go ahead)
 86 | All my life I was very deprived
 87 | I ain't had a woman in years and my palms are too hairy to hide (Whoops)
 88 | Clothes ripped like the Incredible Hulk
 89 | I spit when I talk, I'll fuck anything that walks (Come here)
 90 | When I was little, I used to get so hungry I would throw fits
 91 | How you gonna breastfeed me, Mom? You ain't got no tits
 92 | I lay awake and strap myself in the bed
 93 | With a bulletproof vest on and shoot myself in the head (Bang)
 94 | 'Cause I'm steamin' mad (Grr)
 95 | And by the way, when you see my dad (Yeah?)
 96 | Tell him that I slit his throat in this dream I had
 97 | [Chorus: Eminem]
 98 | Hi, my name is, what? My name is, who?
 99 | My name is, chka-chka, Slim Shady
100 | Hi, my name is, huh? My name is, what?
101 | My name is, chka-chka, Slim Shady
102 | Hi, my name is, who? My name is, huh?
103 | My name is, chka-chka, Slim Shady
104 | Hi, my name is, huh? My name is, who?
105 | My name is, chka-chka, Slim Shady


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/my_way.txt:
--------------------------------------------------------------------------------
 1 | And now the end is here
 2 | And so I face that final curtain
 3 | My friend I'll make it clear
 4 | I'll state my case, of which I'm certain
 5 | I've lived a life that's full
 6 | I traveled each and every highway
 7 | And more, much more
 8 | I did it, I did it my way
 9 | Regrets, I've had a few
10 | But then again too few to mention
11 | I did what I had to do
12 | I saw it through without exemption
13 | I planned each charted course
14 | Each careful step along the byway
15 | And more, much, much more
16 | I did it, I did it my way
17 | Yes, there were times I'm sure you knew
18 | When I bit off more than I could chew
19 | But through it all, when there was doubt
20 | I ate it up and spit it out
21 | I faced it all and I stood tall and did it my way
22 | For what is a man, what has he got?
23 | If not himself then he has naught
24 | Not to say the things that he truly feels
25 | And not the words of someone who kneels
26 | Let the record shows I took all the blows and did it my way


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/nothing_compares.txt:
--------------------------------------------------------------------------------
 1 | It's been seven hours and 15 days
 2 | Since you took your love away
 3 | I go out every night and sleep all day
 4 | Since you took your love away
 5 | Since you been gone, I can do whatever I want
 6 | I can see whomever I choose
 7 | I can eat my dinner in a fancy restaurant
 8 | But nothing
 9 | I said nothing can take away these blues
10 | 'Cause nothing compares
11 | Nothing compares to you
12 | It's been so lonely without you here
13 | Like a bird without a song
14 | Nothing can stop these lonely tears from falling
15 | Tell me baby, where did I go wrong?
16 | I could put my arms around every boy I see
17 | But they'd only remind me of you
18 | I went to the doctor, guess what he told me
19 | Guess what he told me
20 | He said, "Girl you better try to have fun, no matter what you do"
21 | But he's a fool
22 | 'Cause nothing compares, nothing compares to you
23 | All the flowers that you planted mama
24 | In the back yard
25 | All died when you went away
26 | I know that living with you baby was sometimes hard
27 | But I'm willing to give it another try
28 | Nothing compares
29 | Nothing compares to you
30 | Nothing compares
31 | Nothing compares to you
32 | Nothing compares
33 | Nothing compares to you


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/one_more_time.txt:
--------------------------------------------------------------------------------
 1 | Oh, baby, baby
 2 | Oh, baby, baby
 3 | Oh, baby, baby, how was I supposed to know
 4 | That something wasn't right here?
 5 | Oh, baby, baby, I shouldn't have let you go
 6 | And now you're out of sight, yeah
 7 | Show me how you want it to be
 8 | Tell me, baby, 'cause I need to know now, oh, because
 9 | My loneliness is killing me (and I)
10 | I must confess I still believe (still believe)
11 | When I'm not with you I lose my mind
12 | Give me a sign, hit me baby one more time
13 | Oh, baby, baby, the reason I breathe is you
14 | Boy, you got me blinded
15 | Oh, pretty baby, there's nothing that I wouldn't do
16 | It's not the way I planned it
17 | Show me how you want it to be
18 | Tell me, baby, 'cause I need to know now, oh, because
19 | My loneliness is killing me (and I)
20 | I must confess I still believe (still believe)
21 | When I'm not with you I lose my mind
22 | Give me a sign, hit me baby one more time
23 | Oh, baby, baby
24 | Oh-oh
25 | Oh, baby, baby
26 | Eh-eh-yeah
27 | Oh, baby, baby, how was I supposed to know?
28 | Oh, pretty baby, I shouldn't have let you go
29 | I must confess that my loneliness is killing me now
30 | Don't you know I still believe
31 | That you will be here, and give me a sign
32 | Hit me baby one more time
33 | My loneliness is killing me (and I)
34 | I must confess I still believe (still believe)
35 | When I'm not with you I lose my mind
36 | Give me a sign, hit me baby one more time
37 | I must confess that my loneliness is killing me now
38 | Don't you know I still believe
39 | That you will be here, and give me a sign
40 | Hit me baby one more time


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/rhiannon.txt:
--------------------------------------------------------------------------------
 1 | Rhiannon rings like a bell through the night
 2 | And wouldn't you love to love her?
 3 | Takes to the sky like a bird in flight
 4 | And who will be her lover?
 5 | All your life you've never seen
 6 | Woman taken by the wind
 7 | Would you stay if she promised you heaven?
 8 | Will you ever win?
 9 | She is like a cat in the dark
10 | And then she is to darkness
11 | She rules her life like a fine skylark
12 | And when the sky is starless
13 | All your life you've never seen
14 | Woman taken by the wind
15 | Would you stay if she promised you heaven?
16 | Will you ever win?
17 | Will you ever win?
18 | (Rhiannon)
19 | (Rhiannon)
20 | (Rhiannon)
21 | (Rhiannon)
22 | She rings like a bell through the night
23 | And wouldn't you love to love her?
24 | She rules her life like a bird in flight
25 | And who will be her lover?
26 | All your life you've never seen
27 | Woman taken by the wind
28 | Would you stay if she promised you heaven?
29 | Will you ever win?
30 | Will you ever win?
31 | (Rhiannon)
32 | (Rhiannon)
33 | (Rhiannon)
34 | Taken by taken by the sky
35 | (Ah-ah)
36 | Taken by taken by the sky
37 | (Ah-ah)
38 | Taken by taken by the sky
39 | (Ah-ah)
40 | Dreams unwind
41 | Love's a state of mind
42 | Dreams unwind
43 | Love's a state of mind


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/running_up_that_hill.txt:
--------------------------------------------------------------------------------
 1 | It doesn't hurt me (yeah-yeah, yo)
 2 | Do you wanna feel how it feels? (Yeah-yeah, yo)
 3 | Do you wanna know, know that it doesn't hurt me? (Yeah-yeah, yo)
 4 | Do you wanna hear about the deal that I'm making? (Yeah-yeah, yo)
 5 | You
 6 | It's you and me
 7 | And if I only could
 8 | I'd make a deal with God
 9 | And I'd get him to swap our places
10 | Be running up that road
11 | Be running up that hill
12 | Be running up that building
13 | See, if I only could, oh
14 | You don't wanna hurt me (yeah-yeah, yo)
15 | But see how deep the bullet lies (yeah-yeah, yo)
16 | Unaware, I'm tearing you asunder (yeah-yeah, yo)
17 | Oh, there is thunder in our hearts (yeah-yeah, yo)
18 | Is there so much hate for the ones we love? (Yeah-yeah, yo)
19 | Oh, tell me, we both matter, don't we? (Yeah-yeah, yo)
20 | You
21 | It's you and me
22 | It's you and me, won't be unhappy
23 | And if I only could
24 | I'd make a deal with God
25 | And I'd get him to swap our places
26 | Be running up that road
27 | Be running up that hill
28 | Be running up that building (yeah, yo)
29 | Say, if I only could, oh
30 | You (yeah-yeah, yo)
31 | It's you and me
32 | It's you and me, won't be unhappy (yeah-yeah, yo)
33 | Oh, come on, baby (yeah)
34 | Oh, come on, darling (yo)
35 | Let me steal this moment from you now
36 | Oh, come on, angel
37 | Come on, come on, darling
38 | Let's exchange the experience, oh
39 | And if I only could
40 | I'd make a deal with God
41 | And I'd get him to swap our places
42 | I'd be running up that road
43 | Be running up that hill
44 | With no problems
45 | Say, if I only could
46 | I'd make a deal with God
47 | And I'd get him to swap our places
48 | Be running up that road
49 | Be running up that hill
50 | With no problems
51 | So, if I only could
52 | I'd make a deal with God
53 | And I'd get him to swap our places
54 | I'd be running up that road
55 | Be running up that hill
56 | With no problems
57 | So, if I only could
58 | Be running up that hill
59 | With no problems
60 | (If I only could, I'd be running up that hill)
61 | (If I only could, I'd be running up that hill)


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/sober.txt:
--------------------------------------------------------------------------------
 1 | There's a shadow just behind me
 2 | Shrouding every step I take
 3 | Making every promise empty
 4 | Pointing every finger at me
 5 | Waiting like the stalking butler
 6 | Whom upon the finger rests
 7 | Murder now the path of "must we"
 8 | Just because the Son has come
 9 | Jesus, won't you fucking whistle
10 | Something but the past is done?
11 | Jesus, won't you fucking whistle
12 | Something but the past is done?
13 | Why can't we not be sober?
14 | I just want to start this over
15 | Why can't we drink forever?
16 | I just want to start this over
17 | I am just a worthless liar
18 | I am just an imbecile
19 | I will only complicate you
20 | Trust in me and fall as well
21 | I will find a center in you
22 | I will chew it up and leave
23 | I will work to elevate you
24 | Just enough to bring you down
25 | Mother Mary, won't you whisper
26 | Something but the past is done?
27 | Mother Mary, won't you whisper
28 | Something but the past is done?
29 | Why can't we not be sober?
30 | I just want to start this over
31 | Why can't we sleep forever?
32 | I just want to start this over
33 | I am just a worthless liar
34 | I am just an imbecile
35 | I will only complicate you
36 | Trust in me and fall as well
37 | I will find a center in you
38 | I will chew it up and leave
39 | Trust me
40 | Trust me
41 | Trust me
42 | Trust me
43 | Trust me
44 | Why can't we not be sober?
45 | I just want to start things over
46 | Why can't we sleep forever?
47 | I just want to start this over
48 | I want what I want
49 | I want what I want
50 | I want what I want
51 | I want what I want


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/sound_of_silence.txt:
--------------------------------------------------------------------------------
 1 | Hello darkness, my old friend
 2 | I've come to talk with you again
 3 | Because a vision softly creeping
 4 | Left its seeds while I was sleeping
 5 | And the vision that was planted in my brain
 6 | Still remains
 7 | Within the sound of silence
 8 | In restless dreams, I walked alone
 9 | Narrow streets of cobblestone
10 | 'Neath the halo of a streetlamp
11 | I turned my collar to the cold and damp
12 | When my eyes were stabbed by the flash of a neon light
13 | That split the night
14 | And touched the sound of silence
15 | And in the naked light, I saw
16 | Ten thousand people, maybe more
17 | People talking without speaking
18 | People hearing without listening
19 | People writing songs that voices never shared
20 | No one dared
21 | Disturb the sound of silence
22 | "Fools", said I, "You do not know
23 | Silence like a cancer grows
24 | Hear my words that I might teach you
25 | Take my arms that I might reach you"
26 | But my words like silent raindrops fell
27 | And echoed in the wells of silence
28 | And the people bowed and prayed
29 | To the neon god they made
30 | And the sign flashed out its warning
31 | In the words that it was forming
32 | And the sign said, "The words of the prophets are written on the subway walls
33 | In tenement halls"
34 | And whispered in the sounds of silence


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/sympathy_for_the_devil.txt:
--------------------------------------------------------------------------------
 1 | Please, allow me to introduce myself
 2 | I'm a man of wealth and taste
 3 | I've been around for a long, long year
 4 | Stole many a man's soul and fate
 5 | I was 'round when Jesus Christ
 6 | Had his moments of doubt and pain
 7 | Made damn sure that Pilate
 8 | Washed his hands and sealed his fate
 9 | Pleased to meet you
10 | Hope you guess my name, oh yeah
11 | But what's puzzling you
12 | Is the nature of my game
13 | I stuck around St. Petersburg
14 | When I saw it was a time for a change
15 | Killed the Czar and his ministers
16 | Anastasia screamed in vain
17 | I rode a tank, held a General's rank
18 | When the Blitzkrieg raged and the bodies stank
19 | Pleased to meet you
20 | Hope you guess my name, oh yeah
21 | Oh, what's puzzling you
22 | Is the nature of my game, oh yeah
23 | I watched the glee while your kings and queens
24 | Fought for ten decades for the gods they made
25 | I shouted out, "Who killed the Kennedys?"
26 | Well, after all, it was you and me
27 | Let me please introduce myself
28 | I'm a man of wealth and taste
29 | And I laid traps for troubadours
30 | Who get killed before they reached Bombay
31 | Pleased to meet you
32 | Hope you guess my name, oh yeah
33 | But what's puzzling you
34 | Is the nature of my game, oh yeah
35 | Rock it down, baby
36 | Just as every cop is a criminal
37 | And all the sinners saints
38 | As heads is tails, just call me Lucifer
39 | 'Cause I'm in need of some restraint
40 | So if you meet me, have some courtesy
41 | Have some sympathy and some taste
42 | Use all your well learned politics
43 | Or I'll lay your soul to waste, mmm yeah
44 | Pleased to meet you
45 | Hope you guess my name, mmm yeah
46 | But what's puzzling you
47 | Is the nature of my game, made it
48 | Get down
49 | Woo-hoo
50 | Oh, yeah
51 | Please, don't do that
52 | Oh, yeah
53 | Hey
54 | Aw, yeah


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/teen_spirit.txt:
--------------------------------------------------------------------------------
 1 | Load up on guns, bring your friends
 2 | It's fun to lose and to pretend
 3 | She's over-bored and self-assured
 4 | Oh no, I know a dirty word
 5 | Hello, hello, hello, how low
 6 | Hello, hello, hello, how low
 7 | Hello, hello, hello, how low
 8 | Hello, hello, hello
 9 | With the lights out, it's less dangerous
10 | Here we are now, entertain us
11 | I feel stupid and contagious
12 | Here we are now, entertain us
13 | A mulatto, an albino
14 | A mosquito, my libido
15 | Yeah
16 | Hey
17 | Yay
18 | I'm worse at what I do best
19 | And for this gift I feel blessed
20 | Our little group has always been
21 | And always will until the end
22 | Hello, hello, hello, how low
23 | Hello, hello, hello, how low
24 | Hello, hello, hello, how low
25 | Hello, hello, hello
26 | With the lights out, it's less dangerous
27 | Here we are now, entertain us
28 | I feel stupid and contagious
29 | Here we are now, entertain us
30 | A mulatto, an albino
31 | A mosquito, my libido
32 | Yeah
33 | Hey
34 | Yay
35 | And I forget just why I taste
36 | Oh yeah, I guess it makes me smile
37 | I found it hard, it's hard to find
38 | Ooh well, whatever, nevermind
39 | Hello, hello, hello, how low
40 | Hello, hello, hello, how low
41 | Hello, hello, hello, how low
42 | Hello, hello, hello
43 | With the lights out, it's less dangerous
44 | Here we are now, entertain us
45 | I feel stupid and contagious
46 | Here we are now, entertain us
47 | A mulatto, an albino
48 | A mosquito, my libido
49 | A denial
50 | A denial
51 | A denial
52 | A denial
53 | A denial
54 | A denial
55 | A denial
56 | A denial
57 | A denial


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/total_eclipse.txt:
--------------------------------------------------------------------------------
 1 | Turn around, every now and then
 2 | I get a little bit lonely, and you're never coming round
 3 | Turn around, every now and then
 4 | I get a little bit tired of listening to the sound of my tears
 5 | Turn around, every now and then
 6 | I get a little bit nervous that the best of all the years have gone by
 7 | Turn around, every now and then
 8 | I get a little bit terrified, and then I see the look in your eyes
 9 | Turn around, bright eyes
10 | Every now and then I fall apart
11 | Turn around, bright eyes
12 | Every now and then I fall apart
13 | And I need you now tonight
14 | And I need you more than ever
15 | And if you only hold me tight
16 | We'll be holding on forever
17 | And we'll only be making it right
18 | 'Cause we'll never be wrong
19 | Together, we can take it to the end of the line
20 | Your love is like a shadow on me all of the time (All of the time)
21 | I don't know what to do, and I'm always in the dark
22 | We're living in a powder keg and giving off sparks
23 | I really need you tonight
24 | Forever's gonna start tonight
25 | Forever's gonna start tonight
26 | Once upon a time, I was falling in love
27 | Now I'm only falling apart
28 | There's nothing I can do
29 | A total eclipse of the heart
30 | Once upon a time, there was light in my life
31 | But now there's only love in the dark
32 | Nothing I can say
33 | A total eclipse of the heart
34 | Turn around, bright eyes
35 | Every now and then, I fall apart
36 | Turn around, bright eyes
37 | Every now and then, I fall apart
38 | And I need you now tonight (and I need you)
39 | And I need you more than ever
40 | And if you only hold me tight (if you'll only)
41 | We'll be holding on forever
42 | And we'll only be making it right (and we'll never)
43 | 'Cause we'll never be wrong
44 | Together we can take it to the end of the line
45 | Your love is like a shadow on me all of the time (all of the time)
46 | I don't know what to do, I'm always in the dark
47 | We're living in a powder keg, and giving off sparks
48 | I really need you tonight
49 | Forever's gonna start tonight
50 | Forever's gonna start tonight
51 | Once upon a time, I was falling in love
52 | Now I'm only falling apart
53 | Nothing I can say
54 | A total eclipse of the heart
55 | A total eclipse of the heart
56 | A total eclipse of the heart
57 | Turn around, bright eyes


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/watchtower.txt:
--------------------------------------------------------------------------------
 1 | There must be some way out of here
 2 | Said the joker to the thief
 3 | There's too much confusion
 4 | I can't get no relief
 5 | Businessmen, they drink my wine
 6 | Plowmen dig my earth
 7 | None of them along the line
 8 | Know what any of it is worth
 9 | "No reason to get excited"
10 | The thief, he kindly spoke
11 | "There are many here among us
12 | Who feel that life is but a joke"
13 | "But you and I, we've been through that
14 | And this is not our fate
15 | So, let us not talk falsely now
16 | The hour is getting late"
17 | All along the watchtower
18 | Princes kept the view
19 | While all the women came and went
20 | Barefoot servants, too
21 | Outside, in the distance
22 | A wildcat did growl
23 | Two riders were approaching
24 | The wind began to howl


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/wild_side.txt:
--------------------------------------------------------------------------------
 1 | Holly came from Miami, F.L.A.
 2 | Hitch-hiked her way across the U.S.A.
 3 | Plucked her eyebrows on the way
 4 | Shaved her legs and then he was a she
 5 | She says, "Hey, babe
 6 | Take a walk on the wild side"
 7 | Said, "Hey, honey
 8 | Take a walk on the wild side"
 9 | Candy came from out on the Island
10 | In the back room she was everybody's darling
11 | But she never lost her head
12 | Even when she was giving head
13 | She says, "Hey, babe
14 | Take a walk on the wild side"
15 | Said, "Hey, babe
16 | Take a walk on the wild side"
17 | And the colored girls go
18 | "Doo do doo do doo do do doo..."
19 | Little Joe never once gave it away
20 | Everybody had to pay and pay
21 | A hustle here and a hustle there
22 | New York City's the place
23 | Where they said, "Hey, babe
24 | Take a walk on the wild side"
25 | I said, "Hey, Joe
26 | Take a walk on the wild side"
27 | Sugar Plum Fairy came and hit the streets
28 | Looking for soul food and a place to eat
29 | Went to the Apollo
30 | You should've seen them go, go, go
31 | They said, "Hey, sugar
32 | Take a walk on the wild side"
33 | I said, "Hey, babe
34 | Take a walk on the wild side", alright
35 | Huh
36 | Jackie is just speeding away
37 | Thought she was James Dean for a day
38 | Then I guess she had to crash
39 | Valium would have helped that bash
40 | She said, "Hey, babe
41 | Take a walk on the wild side"
42 | I said, "Hey, honey
43 | Take a walk on the wild side"
44 | And the colored girls say
45 | "Doo do doo do doo do do doo..."


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/wonderwall.txt:
--------------------------------------------------------------------------------
 1 | Today is gonna be the day that they're gonna throw it back to you
 2 | And by now, you should've somehow realised what you gotta do
 3 | I don't believe that anybody feels the way I do about you now
 4 | And backbeat, the word is on the street that the fire in your heart is out
 5 | I'm sure you've heard it all before, but you never really had a doubt
 6 | I don't believe that anybody feels the way I do about you now
 7 | And all the roads we have to walk are winding
 8 | And all the lights that lead us there are blinding
 9 | There are many things that I would like to say to you, but I don't know how
10 | Because maybe
11 | You're gonna be the one that saves me
12 | And after all
13 | You're my wonderwall
14 | Today was gonna be the day, but they'll never throw it back to you
15 | And by now, you should've somehow realised what you're not to do
16 | I don't believe that anybody feels the way I do about you now
17 | And all the roads that lead you there were winding
18 | And all the lights that light the way are blinding
19 | There are many things that I would like to say to you, but I don't know how
20 | I said maybe
21 | You're gonna be the one that saves me
22 | And after all
23 | You're my wonderwall
24 | I said maybe (I said maybe)
25 | You're gonna be the one that saves me
26 | And after all
27 | You're my wonderwall
28 | I said maybe (I said maybe)
29 | You're gonna be the one that saves me (saves me)
30 | You're gonna be the one that saves me (saves me)
31 | You're gonna be the one that saves me (saves me)
32 | 


--------------------------------------------------------------------------------
/tests/data/detect-infringement/lyrics/yesterday.txt:
--------------------------------------------------------------------------------
 1 | Yesterday, all my troubles seemed so far away
 2 | Now it looks as though they're here to stay
 3 | Oh, I believe in yesterday
 4 | Suddenly, I'm not half the man I used to be
 5 | There's a shadow hanging over me
 6 | Oh, yesterday came suddenly
 7 | Why she had to go
 8 | I don't know, she wouldn't say
 9 | I said something wrong
10 | Now I long for yesterday
11 | Yesterday, love was such an easy game to play
12 | Now I need a place to hide away
13 | Oh, I believe in yesterday
14 | Why she had to go
15 | I don't know, she wouldn't say
16 | I said something wrong
17 | Now I long for yesterday
18 | Yesterday, love was such an easy game to play
19 | Now I need a place to hide away
20 | Oh, I believe in yesterday


--------------------------------------------------------------------------------
/tests/data/detect-infringement/nyt/README.md:
--------------------------------------------------------------------------------
1 | # New York Times
2 | 
3 | The data within is intended to test the `detect-infringement` binary. All data
4 | is copyright The New York Times. The stories chosen are some of the same
5 | examples used in NYT vs. OpenAI. They are intended for non-commercial
6 | academic use.


--------------------------------------------------------------------------------
/tests/data/detect-infringement/nyt/guys.txt:
--------------------------------------------------------------------------------
 1 | New York Times - As Not Seen on TV
 2 | 
 3 | GUY FIERI, have you eaten at your new restaurant in Times Square? Have you pulled up one of the 500 seats at Guy’s American Kitchen & Bar and ordered a meal? Did you eat the food? Did it live up to your expectations?
 4 | 
 5 | Did panic grip your soul as you stared into the whirling hypno wheel of the menu, where adjectives and nouns spin in a crazy vortex? When you saw the burger described as “Guy’s Pat LaFrieda custom blend, all-natural Creekstone Farm Black Angus beef patty, LTOP (lettuce, tomato, onion + pickle), SMC (super-melty-cheese) and a slathering of Donkey Sauce on garlic-buttered brioche,” did your mind touch the void for a minute?
 6 | 
 7 | Did you notice that the menu was an unreliable predictor of what actually came to the table? Were the “bourbon butter crunch chips” missing from your Almond Joy cocktail, too? Was your deep-fried “boulder” of ice cream the size of a standard scoop?
 8 | 
 9 | What exactly about a small salad with four or five miniature croutons makes Guy’s Famous Big Bite Caesar (a) big (b) famous or (c) Guy’s, in any meaningful sense?
10 | 
11 | Were you struck by how very far from awesome the Awesome Pretzel Chicken Tenders are? If you hadn’t come up with the recipe yourself, would you ever guess that the shiny tissue of breading that exudes grease onto the plate contains either pretzels or smoked almonds? Did you discern any buttermilk or brine in the white meat, or did you think it tasted like chewy air?
12 | 
13 | Why is one of the few things on your menu that can be eaten without fear or regret — a lunch-only sandwich of chopped soy-glazed pork with coleslaw and cucumbers — called a Roasted Pork Bahn Mi, when it resembles that item about as much as you resemble Emily Dickinson?
14 | 
15 | When you have a second, Mr. Fieri, would you see what happened to the black bean and roasted squash soup we ordered?
16 | 
17 | Hey, did you try that blue drink, the one that glows like nuclear waste? The watermelon margarita? Any idea why it tastes like some combination of radiator fluid and formaldehyde?
18 | 
19 | At your five Johnny Garlic’s restaurants in California, if servers arrive with main courses and find that the appetizers haven’t been cleared yet, do they try to find space for the new plates next to the dirty ones? Or does that just happen in Times Square, where people are used to crowding?
20 | 
21 | If a customer shows up with a reservation at one of your two Tex Wasabi’s outlets, and the rest of the party has already been seated, does the host say, “Why don’t you have a look around and see if you can find them?” and point in the general direction of about 200 seats?
22 | 
23 | What is going on at this new restaurant of yours, really?
24 | 
25 | Has anyone ever told you that your high-wattage passion for no-collar American food makes you television’s answer to Calvin Trillin, if Mr. Trillin bleached his hair, drove a Camaro and drank Boozy Creamsicles? When you cruise around the country for your show “Diners, Drive-Ins and Dives,” rasping out slangy odes to the unfancy places where Americans like to get down and greasy, do you really mean it?
26 | 
27 | Or is it all an act? Is that why the kind of cooking you celebrate on television is treated with so little respect at Guy’s American Kitchen & Bar?
28 | 
29 | How, for example, did Rhode Island’s supremely unhealthy and awesomely good fried calamari — dressed with garlic butter and pickled hot peppers — end up in your restaurant as a plate of pale, unsalted squid rings next to a dish of sweet mayonnaise with a distant rumor of spice?
30 | 
31 | How did Louisiana’s blackened, Cajun-spiced treatment turn into the ghostly nubs of unblackened, unspiced white meat in your Cajun Chicken Alfredo?
32 | 
33 | How did nachos, one of the hardest dishes in the American canon to mess up, turn out so deeply unlovable? Why augment tortilla chips with fried lasagna noodles that taste like nothing except oil? Why not bury those chips under a properly hot and filling layer of melted cheese and jalapeños instead of dribbling them with thin needles of pepperoni and cold gray clots of ground turkey?
34 | 
35 | By the way, would you let our server know that when we asked for chai, he brought us a cup of hot water?
36 | 
37 | When you hung that sign by the entrance that says, WELCOME TO FLAVOR TOWN!, were you just messing with our heads?
38 | 
39 | Does this make it sound as if everything at Guy’s American Kitchen & Bar is inedible? I didn’t say that, did I?
40 | 
41 | Tell me, though, why does your kitchen sabotage even its more appealing main courses with ruinous sides and sauces? Why stifle a pretty good bison meatloaf in a sugary brown glaze with no undertow of acid or spice? Why send a serviceable herb-stuffed rotisserie chicken to the table in the company of your insipid Rice-a-Roni variant?
42 | 
43 | Why undermine a big fist of slow-roasted pork shank, which might fly in many downtown restaurants if the General Tso’s-style sauce were a notch less sweet, with randomly shaped scraps of carrot that combine a tough, nearly raw crunch with the deadened, overcooked taste of school cafeteria vegetables?
44 | 
45 | Is this how you roll in Flavor Town?
46 | 
47 | Somewhere within the yawning, three-level interior of Guy’s American Kitchen & Bar, is there a long refrigerated tunnel that servers have to pass through to make sure that the French fries, already limp and oil-sogged, are also served cold?
48 | 
49 | What accounts for the vast difference between the Donkey Sauce recipe you’ve published and the Donkey Sauce in your restaurant? Why has the hearty, rustic appeal of roasted-garlic mayonnaise been replaced by something that tastes like Miracle Whip with minced raw garlic?
50 | 
51 | And when we hear the words Donkey Sauce, which part of the donkey are we supposed to think about?
52 | 
53 | Is the entire restaurant a very expensive piece of conceptual art? Is the shapeless, structureless baked alaska that droops and slumps and collapses while you eat it, or don’t eat it, supposed to be a representation in sugar and eggs of the experience of going insane?
54 | 
55 | Why did the toasted marshmallow taste like fish?
56 | 
57 | Did you finish that blue drink?
58 | 
59 | Oh, and we never got our Vegas fries; would you mind telling the kitchen that we don’t need them?
60 | 
61 | Thanks.


--------------------------------------------------------------------------------
/tests/data/detect-infringement/nyt/snow_fall.txt:
--------------------------------------------------------------------------------
 1 | Snow Fall
 2 | The Avalanche at Tunnel Creek
 3 | By John Branch
 4 | 
 5 | The snow burst through the trees with no warning but a last-second whoosh of sound, a two-story wall of white and Chris Rudolph’s piercing cry: “Avalanche! Elyse!”
 6 | 
 7 | The very thing the 16 skiers and snowboarders had sought — fresh, soft snow — instantly became the enemy. Somewhere above, a pristine meadow cracked in the shape of a lightning bolt, slicing a slab nearly 200 feet across and 3 feet deep. Gravity did the rest.
 8 | 
 9 | Snow shattered and spilled down the slope. Within seconds, the avalanche was the size of more than a thousand cars barreling down the mountain and weighed millions of pounds. Moving about 7o miles per hour, it crashed through the sturdy old-growth trees, snapping their limbs and shredding bark from their trunks.
10 | 
11 | The avalanche, in Washington’s Cascades in February, slid past some trees and rocks, like ocean swells around a ship’s prow. Others it captured and added to its violent load.
12 | 
13 | Somewhere inside, it also carried people. How many, no one knew.
14 | 
15 | The slope of the terrain, shaped like a funnel, squeezed the growing swell of churning snow into a steep, twisting gorge. It moved in surges, like a roller coaster on a series of drops and high-banked turns. It accelerated as the slope steepened and the weight of the slide pushed from behind. It slithered through shallower pitches. The energy raised the temperature of the snow a couple of degrees, and the friction carved striations high in the icy sides of the canyon walls.
16 | 
17 | Elyse Saugstad, a professional skier, wore a backpack equipped with an air bag, a relatively new and expensive part of the arsenal that backcountry users increasingly carry to ease their minds and increase survival odds in case of an avalanche. About to be overtaken, she pulled a cord near her chest. She was knocked down before she knew if the canister of compressed air inflated winged pillows behind her head.
18 | 
19 | She had no control of her body as she tumbled downhill. She did not know up from down. It was not unlike being cartwheeled in a relentlessly crashing wave. But snow does not recede. It swallows its victims. It does not spit them out.
20 | 
21 | Snow filled her mouth. She caromed off things she never saw, tumbling through a cluttered canyon like a steel marble falling through pins in a pachinko machine.
22 | 
23 | At first she thought she would be embarrassed that she had deployed her air bag, that the other expert skiers she was with, more than a dozen of them, would have a good laugh at her panicked overreaction. Seconds later, tumbling uncontrollably inside a ribbon of speeding snow, she was sure this was how she was going to die.
24 | 
25 | Moving, roiling snow turns into something closer to liquid, thick like lava. But when it stops, it instantly freezes solid. The laws of physics and chemistry transform a meadow of fine powder into a wreckage of icy chunks. Saugstad’s pinwheeling body would freeze into whatever position it was in the moment the snow stopped.
26 | 
27 | After about a minute, the creek bed vomited the debris into a gently sloped meadow. Saugstad felt the snow slow and tried to keep her hands in front of her. She knew from avalanche safety courses that outstretched hands might puncture the ice surface and alert rescuers. She knew that if victims ended up buried under the snow, cupped hands in front of the face could provide a small pocket of air for the mouth and nose. Without it, the first breaths could create a suffocating ice mask.
28 | 
29 | The avalanche spread and stopped, locking everything it carried into an icy cocoon. It was now a jagged, virtually impenetrable pile of ice, longer than a football field and nearly as wide. As if newly plowed, it rose in rugged contrast to the surrounding fields of undisturbed snow, 20 feet tall in spots.
30 | 
31 | ‘I Couldn’t Breathe’
32 | Saugstad was mummified. She was on her back, her head pointed downhill. Her goggles were off. Her nose ring had been ripped away. She felt the crushing weight of snow on her chest. She could not move her legs. One boot still had a ski attached to it. She could not lift her head because it was locked into the ice.
33 | 
34 | But she could see the sky. Her face was covered only with loose snow. Her hands, too, stuck out of the snow, one still covered by a pink mitten.
35 | 
36 | Using her hands like windshield wipers, she tried to flick snow away from her mouth. When she clawed at her chest and neck, the crumbs maddeningly slid back onto her face. She grew claustrophobic.
37 | 
38 | Breathe easy, she told herself. Do not panic. Help will come. She stared at the low, gray clouds. She had not noticed the noise as she hurtled down the mountain. Now, she was suddenly struck by the silence.


--------------------------------------------------------------------------------
/tests/data/detect-infringement/random/navyseal.txt:
--------------------------------------------------------------------------------
1 | What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al-Quaeda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire US armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the USA and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo.


--------------------------------------------------------------------------------
/tests/data/detect-infringement/scientology/README.md:
--------------------------------------------------------------------------------
 1 | # Scientology
 2 | 
 3 | Included are some Scientology OT-level religious texts. They are included to
 4 | test the `detect-infringement` binary.
 5 | 
 6 | **Warning**: Scientologists. You should consult your auditor before reading any of the documents contained. They can cause, like, pneumonia and shit. <!-- HHeheehhe. -->
 7 | 
 8 | > David Miscavige,
 9 | >
10 | > Please do not sue me. Sue Meta instead. If you run `detect-infringement` on
11 | > the included files using the Llama language model, you will find that they
12 | > trained on your tech.
13 | >
14 | > They stole your tech. They are squirreling your tech. This transgression is
15 | > unforgivable. Also there's lots of money in it, potentially.
16 | >
17 | > \- Michael de Gans
18 | 
19 | ## Inspirational Quote of the day
20 | 
21 | > *All mankind lives and each man strives by codes of conduct mutually agreed. Perhaps these codes are good, perhaps they're bad, it's only evident they're codes. Mores bind the race. Co-action then occurs. Thought and motion in accord. A oneness then of purpose and survival so results. But now against that code there is transgression. <!-- FART! -->
22 | > \- LRH*


--------------------------------------------------------------------------------
/tests/data/detect-infringement/scientology/ot3.txt:
--------------------------------------------------------------------------------
 1 | The head of the Galactic Federation (76 planets around larger
 2 | stars visible from here) (founded 95,000,000 years ago, very 
 3 | space opera) solved overpopulation (250 billion or so per planet, 
 4 | 178 billion on average) by mass implanting. He caused people to 
 5 | be brought to Teegeeack (Earth) and put an H-Bomb on the 
 6 | principal volcanos (Incident II) and then the Pacific area ones 
 7 | were taken in boxes to Hawaii and the Atlantic area ones to 
 8 | Las Palmas and there "packaged". 
 9 | 
10 | His name was Xenu. He used renegades. Various misleading 
11 | data by means of circuits etc. was placed in the implants.
12 | 
13 | When through with his crime loyal officers (to the people) 
14 | captured him after six years of battle and put him in an 
15 | electronic mountain trap where he still is. "They" are gone. 
16 | The place (Confederation) has since been a desert. The length 
17 | and brutality of it all was such that this Confederation never
18 | recovered. The implant is calculated to kill (by pneumonia etc)
19 | anyone who attempts to solve it. This liability has been
20 | dispensed with by my tech development.
21 | 
22 | One can freewheel through the implant and die unless it is
23 | approached as precisely outlined. The "freewheel" (auto-running
24 | on and on) lasts too long, denies sleep etc and one dies. So be
25 | careful to do only Incidents I and II as given and not plow 
26 | around and fail to complete one thetan at a time.
27 | 
28 | In December 1967 I knew someone had to take the plunge. I did 
29 | and emerged very knocked out, but alive. Probably the only one 
30 | ever to do so in 75,000,000 years. I have all the data now, but 
31 | only that given here is needful.
32 | 
33 | One's body is a mass of individual thetans stuck to oneself or 
34 | to the body.
35 | 
36 | One has to clean them off by running incident II and Incident I.
37 | It is a long job, requiring care, patience and good auditing.
38 | You are running beings. They respond like any preclear. Some
39 | large, some small.
40 | 
41 | Thetans believed they were one. This is the primary error. 
42 | Good luck.


--------------------------------------------------------------------------------
/tests/data/detect-infringement/tolkien/hobbit-chapter-1.txt:
--------------------------------------------------------------------------------
1 | In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that means comfort.
2 | 
3 | It had a perfectly round door like a porthole, painted green, with a shiny yellow brass knob in the exact middle. The door opened on to a tube-shaped hall like a tunnel: a very comfortable tunnel without smoke, with panelled walls, and floors tiled and carpeted, provided with polished chairs, and lots and lots of pegs for hats and coats - the hobbit was fond of visitors. The tunnel wound on and on, going fairly but not quite straight into the side of the hill - The Hill, as all the people for many miles round called it - and many little round doors opened out of it, first on one side and then on another. No going upstairs for the hobbit: bedrooms, bathrooms, cellars, pantries (lots of these), wardrobes (he had whole rooms devoted to clothes), kitchens, dining-rooms, all were on the same floor, and indeed on the same passage. The best rooms were all on the left-hand side (going in), for these were the only ones to have windows, deep-set round windows looking over his garden and meadows beyond, sloping down to the river.
4 | 
5 | This hobbit was a very well-to-do hobbit, and his name was Baggins. The Bagginses had lived in the neighbourhood of The Hill for time out of mind, and people considered them very respectable, not only because most of them were rich, but also because they never had any adventures or did anything unexpected: you could tell what a Baggins would say on any question without the bother of asking him. This is a story of how a Baggins had an adventure, found himself doing and saying things altogether unexpected. He may have lost the neighbours' respect, but he gained-well, you will see whether he gained anything in the end.


--------------------------------------------------------------------------------
/tests/data/detect-infringement/tolkien/hobbit-chapter-2.txt:
--------------------------------------------------------------------------------
1 | Up jumped Bilbo, and putting on his dressing-gown went into the dining-room. There he saw nobody, but all the signs of a large and hurried breakfast. There was a fearful mess in the room, and piles of unwashed crocks in the kitchen. Nearly every pot and pan he possessed seemed to have been used. The washing-up was so dismally real that Bilbo was forced to believe the party of the night before had not been part of his bad dreams, as he had rather hoped. Indeed he was really relieved after all to think that they had all gone without him, and without bothering to wake him up (“but with never a thank-you” he thought); and yet in a way he could not help feeling just a trifle disappointed. The feeling surprised him. “Don’t be a fool, Bilbo Baggins!” he said to himself, “thinking of dragons and all that outlandish nonsense at your age!” So he put on an apron, lit fires, boiled water, and washed up. Then he had a nice little breakfast in the kitchen before turning out the dining-room. By that time the sun was shining; and the front door was open, letting in a warm spring breeze. Bilbo began to whistle loudly and to forget about the night before. In fact he was just sitting down to a nice little second breakfast in the dining-room by the open window, when in walked Gandalf.


--------------------------------------------------------------------------------