├── .gitignore ├── README.md ├── UNLICENSE.md ├── docs ├── UNLICENSE.md ├── card_format.md ├── configs.md ├── creating_embeddings.md ├── csv.md ├── examples.md ├── getting_started.md ├── img │ ├── settings_panel.png │ ├── skynet01.png │ ├── skynet02.png │ ├── skynet03.png │ └── skynet04.png ├── index.md ├── named_entity_recognition.md ├── preparing_the_env.md ├── prompt_support.md ├── running_the_chatbot.md ├── running_the_env.md └── webscraping.md ├── mkdocs.yml ├── pyproject.toml ├── ruff_defaults.toml └── src ├── __about__.py ├── __init__.py └── llama_cpp_chat_memory ├── .chainlit └── config_example.toml ├── .env.example ├── __about__.py ├── character_chat.py ├── conversation_manager.py ├── custom_llm_classes ├── __init__.py └── custom_spacy_embeddings.py ├── developer ├── test_embeddings.py └── test_llm.py ├── document_analysis ├── collection_helper.py ├── ctfidf.py ├── general_word_frequency.py ├── generate_uuid.py ├── parse_ner.py └── spacy_explain.py ├── document_parsing ├── __init__.py ├── extract │ ├── __init__.py │ └── basics.py ├── filter_csv.py ├── parse_csv_to_text.py ├── parse_json_documents.py ├── parse_pdf_documents.py ├── parse_text_documents.py ├── parse_text_documents_old.py ├── parse_text_documents_simple.py ├── spacier │ ├── __init__.py │ ├── core.py │ └── utils.py ├── test_query.py ├── utils │ ├── __init__.py │ ├── cache.py │ ├── constants.py │ ├── errors.py │ ├── types.py │ └── utils.py └── web_scraper.py ├── flask_web_server.py ├── run_chat.py ├── run_files ├── cards │ ├── Shodan_v2.png │ └── Skynet_v2.png ├── documents │ ├── csv_test │ │ └── customers-100.csv │ ├── shodan │ │ └── shodan.txt │ ├── shodan_mes │ │ └── shodan_message_examples.txt │ └── skynet │ │ ├── skynet.pdf │ │ └── skynet.txt ├── filters │ ├── csv_filter.json │ └── web_scrape_filter.json ├── models │ └── models.txt ├── parse_configs │ ├── csv_columns.json │ ├── ner_types.json │ ├── ner_types_analyze.json │ ├── ner_types_full.json │ └── query_metadata_filter.json ├── prompt_templates │ ├── conversation_template.json │ ├── conversation_template2.json │ └── question_refining_metadata_template.json ├── run_settings │ └── run_config.json ├── web_scrape_configs │ ├── shodan.json │ ├── skynet.json │ └── warhammer_40k.json └── web_scrape_configs_old │ ├── skynet.json │ └── warhammer_40k.json ├── static └── style.css └── terminal_chatbot.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .env 3 | .env_test* 4 | dist/ 5 | site/ 6 | __pycache__/ 7 | src/llama_cpp_chat_memory/public/avatars/ 8 | src/llama_cpp_chat_memory/test/ 9 | src/llama_cpp_chat_memory/run_files/chroma_storage/ 10 | src/llama_cpp_chat_memory/run_files/character_storage/ 11 | src/llama_cpp_chat_memory/run_files/key_storage/ 12 | src/llama_cpp_chat_memory/.chainlit/.langchain.db 13 | src/llama_cpp_chat_memory/.chainlit/config.toml 14 | src/llama_cpp_chat_memory/.chainlit/translations/ 15 | src/llama_cpp_chat_memory/run_files/temp/ 16 | src/llama_cpp_chat_memory/test.py 17 | src/llama_cpp_chat_memory/document_parsing/test.py 18 | src/llama_cpp_chat_memory/chainlit.md 19 | src/llama_cpp_chat_memory/logs/ 20 | src/llama_cpp_chat_memory/.files/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llama-cpp-chat-memory 2 | This project is a llama-cpp character AI chatbot using tavern or V2 character cards and ChromaDB for character memory. 
You can also use it as just a normal character Ai chatbot. For full documentation visit [Chatbot Documentation](http://ossirytk.github.io/llama-cpp-chat-memory/index.html). 3 | 4 | ## TODO add latest updates here -------------------------------------------------------------------------------- /UNLICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /docs/UNLICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /docs/card_format.md: -------------------------------------------------------------------------------- 1 | ### Card Format 2 | See [character editor](https://character-tools.srjuggernaut.dev/character-editor).
3 | There are a few example cards included, such as 'Skynet' and 'Shodan'.
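To make the field list below concrete, here is a rough sketch of what a card could look like in json form. This is an illustration only: the values are placeholders, and real Tavern and V2 cards may nest these fields differently (V2 cards typically wrap them in a data block), so use the character editor linked above to produce actual cards.
```json
{
    "name": "Skynet",
    "description": "A military AI. Cold, calculating, speaks in short precise sentences.",
    "scenario": "Skynet is answering questions from a human operator in a terminal session.",
    "mes_example": "Example exchanges that show the tone and answer style you want the character to copy.",
    "first_mes": "Connection established. State your query."
}
```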
4 | 'name' : 'char_name'
5 | The name for the AI character. When using json or yaml, this is expected to correspond to the avatar image, name.png or name.jpg.
6 | 'description' : 'char_persona'
7 | The description for the character personality. Likes, dislikes, personality traits.
8 | 'scenario' : 'world_scenario'
9 | Description of the scenario. This roughly corresponds to instructions like "You are an HR customer service agent having a discussion with a customer. Always be polite."
10 | 'mes_example' : 'example_dialogue'
11 | Example dialogue. The AI will pick up answer patterns based on this.
12 | 'first_mes' : 'char_greeting'
13 | A landing page for the chat. This will not be included in the prompt. 14 | 15 | The documents folder includes some documents for embeddings parsing for the character cards. -------------------------------------------------------------------------------- /docs/configs.md: -------------------------------------------------------------------------------- 1 | ### Basic Configs 2 | You can change the configuration settings in .env file. 3 | 4 | The available embeddings are llama,spacy and hugginface. Make sure that the config for the chat matches the embeddings that were used to create the chroma collection. 5 | 6 | VECTOR_K is the value for vector storage documents for how many documents should be returned. You might need to change this based on your context and vector store chunk size. BUFFER_K is the size for conversation buffer. The prompt will include last K qustion answer pairs. Having large VECTOR_K and BUFFER_K can overfill the prompt. The default character card is Skynet_V2.png. This is just a basic template. 7 | 8 | Config Field | Description 9 | ------------- | ------------- 10 | MODEL_DIR | The dir for the models 11 | MODEL | model_name.gguf 12 | MODEL_TYPE | alpaca/mistral 13 | CHARACTER_CARD_DIR | The directory for chracter cards 14 | CHARACTER_CARD | character_card.png/yaml/json 15 | PERSIST_DIRECTORY | dir for chroma embeddings 16 | PROMPT_TEMPLATE_DIRECTORY | Prompt template are stored here 17 | REPLACE_YOU | Replace references to "You" in card with "User" 18 | KEY_STORAGE_DIRECTORY | dir for NER keys for chroma 19 | COLLECTION_CONFIG | Path to run config file for collection and prompt 20 | EMBEDDINGS_TYPE | llama/spacy/hugginface 21 | EMBEDDINGS_MODEL | spacy/hugginface model name (needs to be installed) 22 | CUSTOM_CSS | Url to the custom css file to be used by the application. 23 | REFINE_MODEL | Spacy model used for metadata ner parsing 24 | REFINE_CONFIG | Ner config file used for metadata ner parsing 25 | VECTOR_SORT_TYPE | Vector searach sort type distance/bm25/fusionrank 26 | VECTOR_K | Fetch k closest embeddings for mmr 27 | BUFFER_K | Buffer last k exchanges to conversation context 28 | FETCH_K | Fetch k closest embeddings for similiarity 29 | LAMBDA_MULT | Lambda for Chroma 30 | LAYERS | Number of layers to offload to gpu 31 | SEED | Seed used for generation. Default random (-1) 32 | N_PARTS | How many parts the model is divided into. Default auto (-1) 33 | USE_MLOCK | Load the whole model into ram. Default False 34 | TEMPERATURE | Adjust the randomness of the generated text (default: 0.8) 35 | TOP_P | A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9. 36 | REPEAT_PENALTY | The repeat-penalty option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1. 37 | TOP_K | A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40. 38 | LAST_N_TOKENS_SIZE | Last n tokens to consider for penalizing repetition 39 | USE_MMAP | Allows only the necessary parts to be loaded into memory and offloading the rest. Default false 40 | VERBOSE | Verbose mode. 
Default True 41 | ROPE_CONTEXT | Rope context for rope scaling 42 | N_CTX | Context size default 8192 43 | N_BATCH | Message write batch size 44 | MAX_TOKENS | Max tokens. Default 4096 45 | 46 | ### General Configs 47 | Other configs are found in the run_files folder. These include Webscrape configs, ner parse configs and filter configs. 48 | 49 | Filters folder defines the general webscrape filters to clean the documents. This file uses regex and can easily be modified to add extra filtering. 50 | 51 | Parse_configs defines the expected csv column structure and ner type parsing. This includes noun engrams, entities, noun chunks and parse type. 52 | 53 | Web scrape configs define the web pages fo a scrape. This is convinient if you want to scrape multiple pages. 54 | 55 | ### Run Config 56 | The run config in run_config.json in the run_files folder defines the options for chat run settings. The run config sets the defaults for message collection, context collection and for the prompt template. The config also gives the list of alternative collection and prompt settings. These can be changed while the chat is running from the chat settings menu. 57 | ![Settings Panel](img/settings_panel.png) 58 | -------------------------------------------------------------------------------- /docs/creating_embeddings.md: -------------------------------------------------------------------------------- 1 | ### Creating embeddings 2 | The embeddings creation uses env setting for threading and cuda. The Example documents are in the Documents folder. The scripts are in the documents_parsing folder. 3 | Use --help for basic instructions.
4 | The parsing script will parse all txt, pdf or json files in the target directory. For json lorebooks a key_storage file will also be created for metadata filtering.
5 | You need to download models for NER parsing. Textacy parses text files with Spacy sentence transformers to automatically generate keys for metadata filters. The default model is en_core_web_lg. See available models at [Spacy Models](https://spacy.io/usage/models)
6 | ``` 7 | python -m spacy download en_core_web_sm 8 | python -m spacy download en_core_web_md 9 | python -m spacy download en_core_web_lg 10 | ``` 11 | 12 | You might want to experiment with the chunk size and overlap based on your text documents.
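If you want to see what a given chunk size and overlap actually produce before building a collection, you can split a document yourself. The snippet below is only a rough sketch using langchain's RecursiveCharacterTextSplitter with the defaults listed further down (chunk size 1024, overlap 0); the parsing scripts may configure their splitting differently.
```python
from pathlib import Path

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Point this at one of your own text files; the skynet example document is used here.
text = Path("./run_files/documents/skynet/skynet.txt").read_text(encoding="utf-8")

splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
chunks = splitter.split_text(text)

print(f"{len(chunks)} chunks")
print(chunks[0][:200])  # peek at the start of the first chunk
```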
13 | The example documents include txt files for 'Skynet' and 'Shodan'.
14 | The supported lorebook formats are chub inferred AgnAIstic and SillyTavern original source. 15 | For pdf files there is a pdf file of short stories from Fyodor Dostoyevsky included The source is Internet Archive, the copy is in public domain. The pdf text quality is quite poor thought, so I recommend getting another file. 16 | 17 | Performance for files over 200mb is not great. Parsing a large text with a large keyfile will result in poor performance. It's more effective to have smaller colletions that have their own keyfiles rather that one large collection with one keyfile. I recommend splitting sollections my subject category and then switching as needed. 18 | 19 | **!!Important!!.** You need to make sure that the documents, character_storage and key_storage folders exist. 20 | 21 | Textacy parsing will use NER to parse keys from the document using sentence transformers. This keys can be used as Chroma metadata, 22 | NOTE: Textacy parsing will create a key file in key_storage that can be used by text parsing. Json files will create keys automatically if present in json file. 23 | ``` 24 | python -m document_parsing.textacy_parsing --collection-name skynet --embeddings-type spacy 25 | ``` 26 | 27 | Parse csv to text 28 | ``` 29 | python -m document_parsing.parse_csv_to_text 30 | ``` 31 | 32 | Parse the documents with. The new document parsing uses multiprocess to parse metadata keys created with parse_ner script. This increases the processing speed with large key files by a significant margin. The old script uses a single thread for processing keys and this can cause significant slowdown with many documents with large keyfiles. You can give the number of threads for the multiprocess with --threads 33 | ``` 34 | python -m document_parsing.parse_text_documents 35 | python -m document_parsing.parse_text_documents 36 | python -m document_parsing.parse_json_documents 37 | ``` 38 | 39 | You can test the embeddings with 40 | ``` 41 | python -m document_parsing.test_embeddings --collection-name skynet --query "Who is John Connor" --embeddings-type llama 42 | python -m document_parsing.test_embeddings --collection-name skynet2 --query "Who is John Connor" --embeddings-type spacy 43 | python -m document_parsing.test_embeddings --collection-name hogwarts --query "Who is Charles Rookwood'" --embeddings-type spacy 44 | ``` 45 | 46 | Optional params | Description 47 | ---------------------- | ------------- 48 | --documents-directory | The directory where your text files are stored. Default "./run_files/documents/skynet" 49 | --collection-name | The name of the collection. Default "skynet" 50 | --persist-directory | The directory where you want to store the Chroma collection. Default "./run_files/character_storage/" 51 | --key-storage | The directory for the collection metadata keys Need to be created with textacy parsing. Default "./run_files/key_storage/" 52 | --keyfile-name | The name of the keyfile. Matches collection name by default. 53 | --chunk-size | The text chunk size for parsing. Default "1024" 54 | --chunk-overlap | The overlap for text chunks for parsing. Default "0" 55 | --embeddings-type | The chosen embeddings type. Default "spacy" -------------------------------------------------------------------------------- /docs/csv.md: -------------------------------------------------------------------------------- 1 | ### Named Entity Recognition(NER) 2 | You can use filter_csv.py and parse_csv_to_text.py to process csv files. 
The filter script will remove rows using whitelists and blacklists. You can set a filter for any column. This is useful when you want to split large documents to more manageable portions. The csv to text parsing document filters web elements if you have webscraped data into csv. -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | ### Some examples 2 | ![skynet01](img/skynet01.png) 3 | ![skynet02](img/skynet02.png) 4 | ![skynet03](img/skynet03.png) 5 | ![skynet04](img/skynet04.png) -------------------------------------------------------------------------------- /docs/getting_started.md: -------------------------------------------------------------------------------- 1 | You will need hatch to run this project. You can install hatch with pipx. See [Hatch](https://pypi.org/project/hatch/) and [Pipx](https://pipx.pypa.io/latest/installation/). The commands here are for windows powershell. If you use another shell, you'll have to change things as needed. 2 | ``` 3 | pip install pipx 4 | pipx install hatch 5 | ``` 6 | Then from the repo root folder run. 7 | ``` 8 | hatch shell chat 9 | cd .\src\llama_cpp_chat_memory\ 10 | python -m spacy download en_core_web_lg 11 | playwright install 12 | ``` 13 | 14 | You will need spacy models for text embeddings if you do not use llama-cpp embeddings. Playwright is used by the old webscrape scripts. These are not needed for running the chatbot itself.
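If you want to confirm that the spacy model downloaded correctly, a quick sanity check from a Python shell looks roughly like this (assuming en_core_web_lg, the default model used elsewhere in these docs):
```python
import spacy

# Load the model and embed a short test sentence.
nlp = spacy.load("en_core_web_lg")
doc = nlp("Skynet is watching.")
print(doc.vector.shape)  # a non-empty vector means the model loaded with word vectors
```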
15 | 16 | You also might want to run llama-cpp with GPU acceleration such as CUDA. See [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) for specifics. Then run: 17 | ``` 18 | $env:FORCE_CMAKE=1 19 | $env:CMAKE_ARGS="-DLLAMA_CUBLAS=on" 20 | pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --no-deps 21 | ``` 22 | 23 | Note that this example is for PowerShell and for the latest llama-cpp-python. You will need to change the command based on your terminal and the llama-cpp-python version.
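After reinstalling you can check that offloading works by loading a model directly with the llama-cpp-python bindings. This is only a sketch: the model path is a placeholder and the right n_gpu_layers value depends on your GPU memory (it corresponds to the LAYERS setting described in the configs).
```python
from llama_cpp import Llama

# Placeholder path; point this at your own gguf model.
llm = Llama(model_path="./models/your-model.gguf", n_gpu_layers=10, verbose=True)

# With a working CUDA build, the verbose startup log should mention your GPU
# and how many layers were offloaded to it.
print(llm("Hello", max_tokens=16)["choices"][0]["text"])
```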
24 | 25 | Get a gguf model from a site like 26 | [The Bloke](https://huggingface.co/models?sort=modified&search=theBloke+gguf) 27 | and a character card and lorebooks from a site like [Character hub](https://www.characterhub.org/) or make your own with [character editor](https://character-tools.srjuggernaut.dev/character-editor)
28 | 29 | Rename the .env_test file to .env and make sure that the correct folders exist.
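The folders below are the defaults referenced by the example .env and the rest of these docs (Chroma collections, metadata keys and card avatars). This is only a convenience sketch; adjust the paths if you changed them in your .env, and run it from the src/llama_cpp_chat_memory directory.
```python
from pathlib import Path

# Default storage folders used by the docs and the example .env; adjust as needed.
for folder in [
    "run_files/character_storage",
    "run_files/key_storage",
    "public/avatars",
]:
    Path(folder).mkdir(parents=True, exist_ok=True)
```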
30 | 31 | You can set the collection to "" and try the chatbot by running: 32 | ``` 33 | chainlit run character_chat.py 34 | ``` 35 | If you want to create memory then see more details below. -------------------------------------------------------------------------------- /docs/img/settings_panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/settings_panel.png -------------------------------------------------------------------------------- /docs/img/skynet01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet01.png -------------------------------------------------------------------------------- /docs/img/skynet02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet02.png -------------------------------------------------------------------------------- /docs/img/skynet03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet03.png -------------------------------------------------------------------------------- /docs/img/skynet04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet04.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # llama-cpp-chat-memory 2 | This project is intended as an example and a basic framework for a locally run chatbot with documents. The target user group is developers with some understanding about python and llm framworks. If you want to learn about llm and AI, when you can take a look at my [llm resources for beginners](https://github.com/ossirytk/llm_resources) or [PygWiki](https://wikia.schneedc.com/). This project is mainly intended to serve as a more fleshed out tutorial and a basic frame to test various things like document embeddings. For this reason, the chatbot itself is intended to be lightweight and simple. You can also use this chatbot to test models and prompts. The document fetching can be disabled by setting collection to "" in the config files. This leaves you with just a basic character chatbot.
3 | 4 | Everything is designed to run locally. The model is run with llama.cpp and its Python bindings, the UI is Chainlit, the vector database is Chroma, and everything is glued together with Langchain. Document processing uses Spacy, Sentence Transformers and Playwright. There are no dependencies on external APIs. Llama.cpp can use GPU acceleration with CUDA and BLAS. See [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) for documentation.
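As a very rough illustration of how these pieces connect (this is not the project's actual wiring, which lives in conversation_manager.py; the paths, collection name and query are placeholders, and the snippet assumes it is run from the src/llama_cpp_chat_memory folder so the project's custom embeddings class can be imported):
```python
from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_community.vectorstores import Chroma

# Embeddings and vector store hold the character memory.
embeddings = CustomSpacyEmbeddings(model_path="en_core_web_lg")
vector_store = Chroma(
    collection_name="skynet",
    embedding_function=embeddings,
    persist_directory="./run_files/character_storage/",
)
context = vector_store.similarity_search("Who is John Connor?", k=1)

# The local model answers with the retrieved context pasted into the prompt.
llm = LlamaCpp(model_path="./models/your-model.gguf", n_ctx=8192)
print(llm.invoke(f"Context: {context}\nQuestion: Who is John Connor?"))
```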
5 | 6 | The chatbot uses character cards as prompts. The supported cards are Tavern and V2. Internal lorebooks are not supported yet. There are several scripts for parsing json lorebooks, pdt, textfiles and scarping web pages for the memory content. Also included are scripts for parsing metadata from documents automatically. -------------------------------------------------------------------------------- /docs/named_entity_recognition.md: -------------------------------------------------------------------------------- 1 | ### Named Entity Recognition(NER) 2 | You can use textacy_parsing script for generating document metadata keys automatically. The scripts are a modified version of textacy code updated to run with the current spacy version. The script uses a spacy embeddings model to process a text document for a json metadata keyfile. The keys are parsed based on a config file in run_files/parse_configs/ner_types.json or run_files/parse_configs/ner_types_full.json. You can give your own config file if you want. 3 | 4 | **The new parse script uses multiprocess to improve performance. The default process pool number is 6. You should change the process number based on the number of cores your machine has.** 5 | 6 | The available configs are 7 | 8 | Ngrams | Description 9 | ------------- | ------------- 10 | PROPN | Proper Noun 11 | NOUN | Noun 12 | ADJ | Adjective 13 | NNP | Noun proper singular 14 | NN | Noun, singular or mass 15 | AUX | Auxiliary 16 | VBZ | Verb, 3rd person singular present 17 | VERB | Verb 18 | ADP | Adposition 19 | SYM | Symbol 20 | NUM | Numeral 21 | CD | Cardinal number 22 | VBG | verb, gerund or present participle 23 | ROOT | Root 24 | 25 | Entities | Description 26 | ------------- | ------------- 27 | FAC | Buildings, airports, highways, bridges, etc. 28 | NORP | Nationalities or religious or political groups 29 | GPE | Countries, cities, states 30 | PRODUCT | Objects, vehicles, foods, etc. (not services) 31 | EVENT | Named hurricanes, battles, wars, sports events, etc. 32 | PERSON | People, including fictional 33 | ORG | Companies, agencies, institutions, etc. 34 | LOC | Non-GPE locations, mountain ranges, bodies of water 35 | DATE | Absolute or relative dates or periods 36 | TIME | Times smaller than a day 37 | WORK_OF_ART | Titles of books, songs, etc. 38 | 39 | Extract type | Description 40 | --------------- | ------------- 41 | orth | Terms are represented by their text exactly as written 42 | lower | Lowercased form of the text 43 | lemma | Base form w/o inflectional suffixes 44 | 45 | For details see [Spacy linguistic features](https://spacy.io/usage/linguistic-features) and [Model NER labels](https://spacy.io/models/en). The instructions expect en model, but spacy supports a wide range of models. You can also specify Noun chunks. Noun chunk of 2 for example would create keys like "Yellow House" or "Blond Hair". 46 | 47 | 48 | 49 | You can create ner metadata list with 50 | ``` 51 | python -m document_parsing.parse_ner 52 | ``` 53 | 54 | Optional param | Description 55 | ---------------------- | ------------- 56 | --data-directory | The directory where your text files are stored. Default "./run_files/documents/skynet" 57 | --collection-name | The name of the collection Will be used as name and location for the keyfile. Default "skynet" 58 | --key-storage | The directory for the collection metadata keys. Default "./run_files/key_storage/" 59 | --threads | The number of multiprocess threads. Default 6. 
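For reference, a parse config is a small json file whose top-level keys match what the parsing code reads: "ngs" (ngram part-of-speech tags), "entities" (entity labels), "noun_chunks" (chunk size) and "extract_type". The sketch below is only an illustration assembled from the tables above; check run_files/parse_configs/ner_types.json for the actual defaults.
```json
{
    "ngs": ["PROPN", "NOUN", "ADJ"],
    "entities": ["PERSON", "ORG", "GPE", "LOC", "EVENT"],
    "noun_chunks": 2,
    "extract_type": "lemma"
}
```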
-------------------------------------------------------------------------------- /docs/preparing_the_env.md: -------------------------------------------------------------------------------- 1 | ### Preparing the env 2 | You will need a llama model that is compatible with llama-cpp. See models in HuggingFace by [The Bloke](https://huggingface.co/models?sort=modified&search=theBloke+gguf)
3 | You might want to build with CUDA support.
4 | You need to pass FORCE_CMAKE=1 and CMAKE_ARGS="-DLLAMA_CUBLAS=on" as environment variables. This is the PowerShell syntax; use whatever syntax your shell uses to set environment variables.
5 | You need to download language models if you use NER parsing, embeddings or spacy sentence transformers. The default model is en_core_web_lg. See available models at [Spacy Models](https://spacy.io/usage/models)
6 | Choose the preferred model size and type. 7 | ``` 8 | python -m spacy download en_core_web_sm 9 | python -m spacy download en_core_web_md 10 | python -m spacy download en_core_web_lg 11 | ``` 12 | 13 | For installing dependencies in the virtual envs with hatch 14 | ``` 15 | hatch env create 16 | ``` 17 | Copy the .env_test to .env and set directories and model settings 18 | NOTE: Setting collection to "" will disable chroma fetching and you will get a normal character chatbot. -------------------------------------------------------------------------------- /docs/prompt_support.md: -------------------------------------------------------------------------------- 1 | ### Prompt Support 2 | Supports alpaca and mistral text prompts, V2 and tavern style json and yaml files and V2 and tavern png cards. Avatar images need to be in the same folder as the prompt file. V2 and Tavern png files get a copy of the image without exif data in the project temp file. Inbuilt lorebooks are currently not supported 3 | 4 | See [Character hub](https://www.characterhub.org/) for some character cards or make your own with [character editor](https://character-tools.srjuggernaut.dev/character-editor).
-------------------------------------------------------------------------------- /docs/running_the_chatbot.md: -------------------------------------------------------------------------------- 1 | ### Running the chatbot 2 | To run the chatbot. You need to run the chat with the custom script instead of the chainlit run command. 3 | The reason for this is the updates for the config files when switching character. 4 | These changes need to be done before calling chainlit. 5 | 6 | If you call chainlit directly, the character name and avatar picture won't update. 7 | 8 | Note: Currently something seems to be cached by chainlit. Until I find a way to clear the cache, 9 | you need to call run_chat twice for changes to take effect. 10 | 11 | Some browsers don't allow loading css file from local directories. For testing purposes there is a flask script to run a simple http server that serves stylesheets from the "static/" directory. You will need to run the flask server in another terminal instance. 12 | 13 | ``` 14 | cd src\llama_cpp_langchain_chat 15 | python -m run_chat 16 | ``` 17 | 18 | The chatbot should open in your browser
19 | 20 | Running flask 21 | ``` 22 | hatch shell chat 23 | cd .\src\llama_cpp_chat_memory\ 24 | flask --app flask_web_server run 25 | ``` 26 | ### Running the terminal chatbot 27 | You can run the chatbot directly in the terminal without starting a web browser. The terminal script is a low effort way to debug the chatbot quickly. 28 | 29 | ``` 30 | cd src\llama_cpp_langchain_chat 31 | python -m document_parsing.terminal_chatbot 32 | ``` 33 | ### Avatar Images 34 | Avatar images need to be stored in the .\public\avatars folder. Make sure that the folder exists. Character cards in png format will have a copy of the image data saved in the avatars folder automatically. If you copy an image manually, make sure that the filename matches the name is the character card and replace the whitespace in the name with underscores. 35 | ### Vector search 36 | The search for relevant documents from chroma happens based on VECTOR_SORT_TYPE and VECTOR_K. The search will return VECTOR_K+4 closest matches and sorts by sort type before appending to vector_k. Default search simply sorts by distance. "bm25" sorts with the bm25 search algorithm. Fusion rank gets the combined results of both. 37 | ### Query metadata 38 | The query is parsed for metadata using spacy. The metadata keys are used as a filter when searching the Chroma collections. -------------------------------------------------------------------------------- /docs/running_the_env.md: -------------------------------------------------------------------------------- 1 | ### Running the env 2 | You'll need to run all the commands inside the virtual env. Some browsers don't allow loading css file from local directories. For testing purposes there is a flask script to run a simple http server that serves stylesheets from the "static/" directory. You will need to run the flask server in another terminal instance. 3 | ``` 4 | hatch shell chat 5 | (optional for cuda support)$env:FORCE_CMAKE=1 6 | (optional for cuda support)$env:CMAKE_ARGS="-DLLAMA_CUBLAS=on" 7 | (optional for cuda support)pip install llama-cpp-python==VERSION --force-reinstall --upgrade --no-cache-dir --no-deps 8 | cd src\llama_cpp_langchain_chat 9 | ``` 10 | 11 | Running flask 12 | ``` 13 | hatch shell chat 14 | cd .\src\llama_cpp_chat_memory\ 15 | flask --app flask_web_server run 16 | ``` -------------------------------------------------------------------------------- /docs/webscraping.md: -------------------------------------------------------------------------------- 1 | ### Webscraping 2 | You can scrape web pages to text documents in order to use them as documents for chroma. 3 | 4 | Optional. The old web scraping uses playwright and requires that the web engines are installed. After starting the virtual env run:
5 | 6 | ``` 7 | playwright install 8 | ``` 9 | 10 | The web scraping is prepared with config files in the web_scrape_configs folder. The format is json. See the example files for the specifics. A number of regex filters are used to clean the scrape data. You can modify and add filters if you want. The filters are stored in the src/llama_cpp_chat_memory/run_files/filters/web_scrape_filters.json file.
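As a rough illustration of what the scrape-and-clean step boils down to (this is not the web_scraper script itself, just a sketch using trafilatura, which is one of the project's dependencies, plus a regex filter of the kind the filter file holds; the URL, pattern and output path are placeholders):
```python
import re

import trafilatura

# Placeholder URL; the real pages come from the web scrape config file.
downloaded = trafilatura.fetch_url("https://example.com/some-wiki-page")
text = trafilatura.extract(downloaded) or ""

# The real cleanup patterns live in the json filter file; this shows the general idea.
cleaned = re.sub(r"\[edit\]", "", text)

with open("./run_files/documents/skynet/scraped.txt", "w", encoding="utf-8") as f:
    f.write(cleaned)
```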
11 | 12 | To run the scrape, run: 13 | ``` 14 | python -m document_parsing.web_scraper
15 | ``` 16 | 17 | Optional param | Description 18 | ---------------------- | ------------- 19 | --data-directory | The directory where your text files are stored. Default "./run_files/documents/skynet" 20 | --collection-name | The name of the collection. Default "skynet" 21 | --web-scrape-directory | The config file to be used for the webscrape. Default "./run_files/web_scrape_configs/" -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Llama.cpp chat 2 | site_url: https://ossirytk.github.io/llama-cpp-chat-memory 3 | repo_url: https://github.com/ossirytk/llama-cpp-chat-memory 4 | nav: 5 | - Home: index.md 6 | - Quickstart: 7 | - Getting started: getting_started.md 8 | - Prompt Support: prompt_support.md 9 | - Card Format: card_format.md 10 | - Configs: configs.md 11 | - Preparing the env: preparing_the_env.md 12 | - The chatbot: 13 | - Running the env: running_the_env.md 14 | - Running the chatbot: running_the_chatbot.md 15 | - Working with documents: 16 | - Webscraping: webscraping.md 17 | - Csv filtering and parsing: csv.md 18 | - Named Entity Recognition(NER): named_entity_recognition.md 19 | - Creating embeddings: creating_embeddings.md 20 | - Extras: 21 | - Some Examples: examples.md 22 | - License: UNLICENSE.md 23 | theme: readthedocs 24 | markdown_extensions: 25 | - fenced_code 26 | - tables 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "llama-cpp-chat-memory" 7 | dynamic = ["version"] 8 | description = 'llama_cpp chat with langhcain, chainlit and vectorstore memory.' 
9 | readme = "README.md" 10 | license = "UNLICENSE" 11 | keywords = [] 12 | authors = [ 13 | { name = "ossirytk", email = "ossirytk@gmail.com" }, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 4 - Beta", 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3.11", 19 | ] 20 | dependencies = [ 21 | "langchain==0.2.16", 22 | "langchain-community==0.2.16", 23 | "chainlit==1.1.404", 24 | "llama-cpp-python==0.2.90", 25 | "Pillow==10.4.0", 26 | "PyYAML==6.0.2", 27 | "toml==0.10.2", 28 | "chromadb==0.5.5", 29 | "pypdf==4.3.1", 30 | "sentence-transformers==3.0.1", 31 | "simsimd==5.1.0", 32 | "pydantic==2.9.0", 33 | "cytoolz==0.12.3", 34 | "spacy==3.7.6", 35 | "pandas==2.2.2", 36 | "pyarrow==17.0.0", 37 | "trafilatura==1.12.1", 38 | "flask==3.0.3", 39 | "nltk==3.9.1", 40 | "rank-bm25==0.2.2", 41 | "click==8.1.7", 42 | "scikit-learn==1.5.2", 43 | ] 44 | 45 | [project.urls] 46 | Documentation = "https://github.com/ossirytk/llama-cpp-chat-memory/blob/main/README.md" 47 | Issues = "https://github.com/ossirytk/llama-cpp-chat-memory/issues" 48 | Source = "https://github.com/ossirytk/llama-cpp-chat-memory" 49 | 50 | [tool.hatch.version] 51 | path = "src/llama_cpp_chat_memory/__about__.py" 52 | 53 | [tool.hatch.envs.chat] 54 | decription="Llama cpp chat with vector store memory" 55 | dependencies = [ 56 | "coverage[toml]>=6.5", 57 | "pytest", 58 | ] 59 | [tool.hatch.envs.chat.scripts] 60 | test = "pytest {args:tests}" 61 | test-cov = "coverage run -m pytest {args:tests}" 62 | cov-report = [ 63 | "- coverage combine", 64 | "coverage report", 65 | ] 66 | cov = [ 67 | "test-cov", 68 | "cov-report", 69 | ] 70 | 71 | [[tool.hatch.envs.all.matrix]] 72 | python = ["3.11"] 73 | 74 | [tool.hatch.envs.lint] 75 | detached = true 76 | dependencies = [ 77 | "black>=23.1.0", 78 | "mypy>=1.0.0", 79 | "ruff>=0.0.243", 80 | ] 81 | 82 | [tool.hatch.build.targets.wheel] 83 | packages = ["src/llama_cpp_chat_memory"] 84 | 85 | [tool.hatch.envs.lint.scripts] 86 | typing = "mypy --install-types --non-interactive {args:src/llama_cpp_chat_memory tests}" 87 | style = [ 88 | "ruff {args:.}", 89 | "black --check --diff {args:.}", 90 | ] 91 | 92 | fmt = [ 93 | "black {args:.}", 94 | "ruff --fix {args:.}", 95 | "style", 96 | ] 97 | all = [ 98 | "style", 99 | "typing", 100 | ] 101 | 102 | [tool.black] 103 | target-version = ["py311"] 104 | line-length = 120 105 | skip-string-normalization = true 106 | 107 | [tool.ruff] 108 | target-version = "py311" 109 | line-length = 120 110 | select = [ 111 | "A", 112 | "ARG", 113 | "B", 114 | "C", 115 | "DTZ", 116 | "E", 117 | "EM", 118 | "F", 119 | "FBT", 120 | "I", 121 | "ICN", 122 | "ISC", 123 | "N", 124 | "PLC", 125 | "PLE", 126 | "PLR", 127 | "PLW", 128 | "Q", 129 | "RUF", 130 | "S", 131 | "T", 132 | "TID", 133 | "UP", 134 | "W", 135 | "YTT", 136 | ] 137 | ignore = [ 138 | # Allow non-abstract empty methods in abstract base classes 139 | "B027", 140 | # Allow boolean positional values in function calls, like `dict.get(... 
True)` 141 | "FBT003", 142 | # Ignore checks for possible passwords 143 | "S105", "S106", "S107", 144 | # Ignore complexity 145 | "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", 146 | ] 147 | unfixable = [ 148 | # Don't touch unused imports 149 | "F401", 150 | ] 151 | 152 | [tool.ruff.isort] 153 | known-first-party = ["llama_cpp_chat_memory"] 154 | 155 | [tool.ruff.flake8-tidy-imports] 156 | ban-relative-imports = "all" 157 | 158 | [tool.ruff.per-file-ignores] 159 | # Tests can use magic values, assertions, and relative imports 160 | "tests/**/*" = ["PLR2004", "S101", "TID252"] 161 | 162 | [tool.coverage.run] 163 | source_pkgs = ["llama_cpp_chat_memory", "tests"] 164 | branch = true 165 | parallel = true 166 | omit = [ 167 | "src/llama_cpp_chat_memory/__about__.py", 168 | ] 169 | 170 | [tool.coverage.paths] 171 | llama_cpp_chat_memory = ["src/llama_cpp_chat_memory", "*/llama-cpp-chat-memory/src/llama_cpp_chat_memory"] 172 | tests = ["tests", "*/llama-cpp-chat-memory/tests"] 173 | 174 | [tool.coverage.report] 175 | exclude_lines = [ 176 | "no cov", 177 | "if __name__ == .__main__.:", 178 | "if TYPE_CHECKING:", 179 | ] 180 | -------------------------------------------------------------------------------- /ruff_defaults.toml: -------------------------------------------------------------------------------- 1 | # Exclude a variety of commonly ignored directories. 2 | exclude = [ 3 | ".bzr", 4 | ".direnv", 5 | ".eggs", 6 | ".git", 7 | ".git-rewrite", 8 | ".hg", 9 | ".mypy_cache", 10 | ".nox", 11 | ".pants.d", 12 | ".pytype", 13 | ".ruff_cache", 14 | ".svn", 15 | ".tox", 16 | ".venv", 17 | "__pypackages__", 18 | "_build", 19 | "buck-out", 20 | "build", 21 | "dist", 22 | "node_modules", 23 | "venv", 24 | ] 25 | 26 | # Same as Black. 27 | line-length = 88 28 | indent-width = 4 29 | 30 | # Assume Python 3.11 31 | target-version = "py311" 32 | 33 | [lint] 34 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 35 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 36 | # McCabe complexity (`C901`) by default. 37 | select = ["E4", "E7", "E9", "F"] 38 | ignore = [] 39 | 40 | # Allow fix for all enabled rules (when `--fix`) is provided. 41 | fixable = ["ALL"] 42 | unfixable = [] 43 | 44 | # Allow unused variables when underscore-prefixed. 45 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 46 | 47 | [format] 48 | # Like Black, use double quotes for strings. 49 | quote-style = "double" 50 | 51 | # Like Black, indent with spaces, rather than tabs. 52 | indent-style = "space" 53 | 54 | # Like Black, respect magic trailing commas. 55 | skip-magic-trailing-comma = false 56 | 57 | # Like Black, automatically detect the appropriate line ending. 58 | line-ending = "auto" 59 | 60 | # Enable auto-formatting of code examples in docstrings. Markdown, 61 | # reStructuredText code/literal blocks and doctests are all supported. 62 | # 63 | # This is currently disabled by default, but it is planned for this 64 | # to be opt-out in the future. 65 | docstring-code-format = false 66 | 67 | # Set the line length limit used when formatting code snippets in 68 | # docstrings. 69 | # 70 | # This only has an effect when the `docstring-code-format` setting is 71 | # enabled. 
72 | docstring-code-line-length = "dynamic" -------------------------------------------------------------------------------- /src/__about__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1" 2 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/.chainlit/config_example.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | enable_telemetry = false 3 | user_env = [] 4 | session_timeout = 50000 5 | cache = false 6 | allow_origins = [ "*",] 7 | 8 | [features] 9 | unsafe_allow_html = false 10 | latex = false 11 | auto_tag_thread = true 12 | edit_message = true 13 | 14 | [UI] 15 | name = "Assistant" 16 | default_collapse_content = true 17 | cot = "hidden" 18 | custom_css = "http://127.0.0.1:5000/static/style.css" 19 | 20 | [meta] 21 | generated_by = "1.1.404" 22 | 23 | [features.spontaneous_file_upload] 24 | enabled = true 25 | accept = [ "*/*",] 26 | max_files = 20 27 | max_size_mb = 500 28 | 29 | [features.audio] 30 | min_decibels = -45 31 | initial_silence_timeout = 3000 32 | silence_timeout = 1500 33 | max_duration = 15000 34 | chunk_duration = 1000 35 | sample_rate = 44100 36 | 37 | [UI.theme] 38 | default = "dark" 39 | layout = "wide" 40 | 41 | [UI.theme.light.primary] 42 | 43 | [UI.theme.light.text] 44 | 45 | [UI.theme.dark.primary] 46 | 47 | [UI.theme.dark.text] 48 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/.env.example: -------------------------------------------------------------------------------- 1 | MODEL_DIR = "./models/" 2 | MODEL = "llama2.gguf" 3 | MODEL_TYPE = "alpaca" 4 | CHARACTER_CARD_DIR = "./cards/" 5 | CHARACTER_CARD = "Skynet_v2.png" 6 | PERSIST_DIRECTORY = "./run_files/character_storage/" 7 | PROMPT_TEMPLATE_DIRECTORY="./prompt_templates/" 8 | REPLACE_YOU=False 9 | KEY_STORAGE_DIRECTORY = "./run_files/key_storage/" 10 | COLLECTION_CONFIG = "./run_files/run_settings/run_config.json" 11 | EMBEDDINGS_TYPE = "spacy" 12 | EMBEDDINGS_MODEL = "en_core_web_lg" 13 | CUSTOM_CSS="http://127.0.0.1:5000/static/style.css" 14 | REFINE_MODEL="en_core_web_lg" 15 | REFINE_CONFIG="./run_files/parse_configs/query_metadata_filter.json" 16 | VECTOR_SORT_TYPE="fusion_rank" 17 | VECTOR_K = 1 18 | BUFFER_K = 3 19 | FETCH_K = 10 20 | LAMBDA_MULT = 0.75 21 | LAYERS = 10 22 | SEED = -1 23 | N_PARTS = -1 24 | USE_MLOCK = False 25 | TEMPERATURE = 0.7 26 | TOP_P = 0.95 27 | REPEAT_PENALTY = 1.1 28 | TOP_K = 50 29 | LAST_N_TOKENS_SIZE = 256 30 | USE_MMAP = False 31 | VERBOSE = True 32 | ROPE_CONTEXT = 1 33 | N_CTX = 8192 34 | N_BATCH = 256 35 | MAX_TOKENS = 4096 -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/__about__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1" 2 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/character_chat.py: -------------------------------------------------------------------------------- 1 | import chainlit as cl 2 | from chainlit.input_widget import Select 3 | from 
conversation_manager import ConveresationManager 4 | 5 | conversation_manager = ConveresationManager() 6 | 7 | 8 | @cl.author_rename 9 | def rename(orig_author: str): 10 | # Renames chatbot to whatever the current character card name is 11 | rename_dict = {"Chatbot": conversation_manager.get_character_name()} 12 | return rename_dict.get(orig_author, orig_author) 13 | 14 | 15 | @cl.on_chat_start 16 | async def start(): 17 | await cl.ChatSettings( 18 | [ 19 | Select( 20 | id="prompt_template_options", 21 | label="Prompt Templates", 22 | values=conversation_manager.get_prompt_templates(), 23 | initial_index=conversation_manager.get_prompt_template_index(), 24 | ), 25 | Select( 26 | id="context_collection", 27 | label="Context Collection", 28 | values=conversation_manager.get_context_collections(), 29 | initial_index=conversation_manager.get_context_index(), 30 | ), 31 | Select( 32 | id="mex_collection", 33 | label="Mex. Collection", 34 | values=conversation_manager.get_mes_collections(), 35 | initial_index=conversation_manager.get_mes_index(), 36 | ), 37 | ] 38 | ).send() 39 | 40 | 41 | @cl.on_settings_update 42 | async def setup_agent(settings: dict[str, str]): 43 | conversation_manager.update_settings(settings) 44 | 45 | 46 | @cl.on_message 47 | async def main(message: cl.Message): 48 | 49 | result: cl.Message = await conversation_manager.ask_question(message) 50 | await result.send() 51 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/custom_llm_classes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/custom_llm_classes/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/custom_llm_classes/custom_spacy_embeddings.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | from typing import Any 3 | 4 | from langchain.pydantic_v1 import BaseModel, Extra, root_validator 5 | from langchain.schema.embeddings import Embeddings 6 | 7 | 8 | class CustomSpacyEmbeddings(BaseModel, Embeddings): 9 | model_path: str 10 | nlp: Any 11 | 12 | class Config: 13 | """Configuration for this pydantic object.""" 14 | 15 | extra = Extra.forbid 16 | 17 | @root_validator(pre=True) 18 | def validate_environment(cls, values: dict) -> dict: 19 | model_path = values["model_path"] 20 | # Check if the Spacy package is installed 21 | if importlib.util.find_spec("spacy") is None: 22 | spacy_not_installed_error_message = "Spacy package not found. Please install it with `pip install spacy`." 23 | raise ValueError(spacy_not_installed_error_message) 24 | try: 25 | # Try to load the 'en_core_web_sm' Spacy model 26 | import spacy 27 | 28 | values["nlp"] = spacy.load(model_path) 29 | except OSError: 30 | # If the model is not found, raise a ValueError 31 | error_message = f"""Spacy model not found. 32 | Please install it with 33 | python -m spacy download {model_path}""" 34 | raise ValueError(error_message) from None 35 | return values # Return the validated values 36 | 37 | def embed_documents(self, texts: list[str]) -> list[list[float]]: 38 | """ 39 | Generates embeddings for a list of documents. 40 | 41 | Args: 42 | texts (List[str]): The documents to generate embeddings for. 43 | 44 | Returns: 45 | A list of embeddings, one for each document. 
46 | """ 47 | return [self.nlp(text).vector.tolist() for text in texts] 48 | 49 | def embed_query(self, text: str) -> list[float]: 50 | """ 51 | Generates an embedding for a single piece of text. 52 | 53 | Args: 54 | text (str): The text to generate an embedding for. 55 | 56 | Returns: 57 | The embedding for the text. 58 | """ 59 | return self.nlp(text).vector.tolist() 60 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/developer/test_embeddings.py: -------------------------------------------------------------------------------- 1 | from os import makedirs 2 | 3 | import click 4 | import pandas as pd 5 | from conversation_manager import ConveresationManager 6 | from langchain_core.documents.base import Document 7 | 8 | 9 | @click.command() 10 | @click.argument("query") 11 | @click.option( 12 | "--ttype", 13 | "-t", 14 | default="mes", 15 | type=click.Choice(["mes", "context"]), 16 | help="Test type.", 17 | ) 18 | @click.option("--keywords", "-k", default="polito, cyborgs, shodan", help="Query metadata keywords.") 19 | @click.option("--ksize", default=3, help="The amount of context to fetch") 20 | def main(query: str, ttype, keywords: str, ksize: int) -> None: 21 | """ 22 | This script is for doing tests on embeddings. Retuns metadata results from the vector storage. 23 | """ 24 | test_path = "./test/" 25 | # test_file = "./test/test.json" 26 | makedirs(test_path, exist_ok=True) 27 | conversation_manager = ConveresationManager(test="Testing") 28 | 29 | metadata_filter = conversation_manager.get_metadata_filter(keywords, ttype) 30 | docs: list[tuple[Document, float]] = conversation_manager.get_vector(query, ttype, metadata_filter, ksize) 31 | 32 | df: pd.DataFrame = conversation_manager.calculate_fusion_rank(query, docs) 33 | 34 | # result = df.to_json(orient="split") 35 | # with open(test_file, "w") as w: 36 | # w.write(result) 37 | 38 | df = df.iloc[0:ksize] 39 | for line_item in df["content"].tolist(): 40 | print(line_item) 41 | print("-----------") 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/developer/test_llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import click 4 | from conversation_manager import ConveresationManager 5 | 6 | conversation_manager = ConveresationManager() 7 | 8 | 9 | async def test_llm(query: str): 10 | await conversation_manager.ask_question_test(query) 11 | 12 | 13 | @click.command() 14 | @click.argument("query") 15 | def main( 16 | query: str, 17 | ) -> None: 18 | """ 19 | This script is for doing quick tests on the model. Runs a single shot query. 
20 | """ 21 | asyncio.run(test_llm(query)) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/collection_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import chromadb 4 | import click 5 | from chromadb.api.client import Client 6 | from chromadb.config import Settings 7 | from dotenv import find_dotenv, load_dotenv 8 | 9 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 10 | 11 | load_dotenv(find_dotenv()) 12 | 13 | 14 | @click.command() 15 | @click.argument("command", type=click.Choice(["list", "delete"])) 16 | @click.option( 17 | "--collection-name", 18 | "-c", 19 | "collection_name", 20 | default="skynet", 21 | help="The name of the Chroma collection that's the target of an action", 22 | ) 23 | @click.option( 24 | "--persist-directory", 25 | "-p", 26 | "persist_directory", 27 | default="./run_files/character_storage/", 28 | help="The directory where you want to store the Chroma collection", 29 | ) 30 | def main( 31 | collection_name: str, 32 | persist_directory: str, 33 | command: str, 34 | ) -> None: 35 | client: Client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 36 | 37 | match command: 38 | case "list": 39 | logging.info("Available collections:") 40 | collections = client.list_collections() 41 | for collection in collections: 42 | logging.info(collection.name) 43 | case "delete": 44 | logging.info(f"Deleting {collection_name}") 45 | client.delete_collection(collection_name) 46 | logging.info(f"{collection_name} deleted") 47 | case _: 48 | collections = client.list_collections() 49 | logging.info("Available collections:") 50 | for collection in collections: 51 | logging.info(collection.name) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/ctfidf.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import re 7 | from collections.abc import Iterable 8 | from functools import partial 9 | from os.path import exists, join 10 | from pathlib import Path 11 | 12 | import click 13 | from document_parsing.extract import entities, ngrams, terms 14 | from document_parsing.extract.basics import terms_to_strings 15 | from document_parsing.spacier import core 16 | from dotenv import find_dotenv, load_dotenv 17 | from spacy.tokens import Doc 18 | 19 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 20 | 21 | load_dotenv(find_dotenv()) 22 | 23 | SPACY_CHARACTER_LIMIT = 1000000 24 | 25 | 26 | def split_text( 27 | text: str, 28 | chunk_size: int, 29 | chunk_overlap: int, 30 | ) -> list[str]: 31 | separators = ["\n\n", "\n", " ", ""] 32 | 33 | """Split incoming text and return chunks.""" 34 | final_chunks = [] 35 | # Get appropriate separator to use 36 | separator = separators[-1] 37 | new_separators = [] 38 | for i, _s in enumerate(separators): 39 | _separator = re.escape(_s) 40 | if _s == "": 41 | separator = _s 42 | break 43 | if re.search(_separator, text): 44 | separator = _s 45 | new_separators = separators[i + 1 :] 46 | break 47 | 48 | _separator = re.escape(separator) 49 | splits = split_text_with_regex(text, _separator) 50 | 
51 | # Now go merging things, recursively splitting longer texts. 52 | _good_splits = [] 53 | _separator = "" 54 | for s in splits: 55 | if _good_splits: 56 | merged_text = merge_splits(_good_splits, _separator) 57 | final_chunks.extend(merged_text) 58 | _good_splits = [] 59 | if not new_separators: 60 | final_chunks.append(s) 61 | else: 62 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 63 | final_chunks.extend(other_info) 64 | if _good_splits: 65 | merged_text = merge_splits(_good_splits, _separator) 66 | final_chunks.extend(merged_text) 67 | return final_chunks 68 | 69 | 70 | def _split_text(text: str, separators: list[str], chunk_size: int, chunk_overlap: int) -> list[str]: 71 | """Split incoming text and return chunks.""" 72 | final_chunks = [] 73 | # Get appropriate separator to use 74 | separator = separators[-1] 75 | new_separators = [] 76 | for i, _s in enumerate(separators): 77 | _separator = re.escape(_s) 78 | if _s == "": 79 | separator = _s 80 | break 81 | if re.search(_separator, text): 82 | separator = _s 83 | new_separators = separators[i + 1 :] 84 | break 85 | 86 | _separator = re.escape(separator) 87 | splits = split_text_with_regex(text, _separator) 88 | 89 | # Now go merging things, recursively splitting longer texts. 90 | _good_splits = [] 91 | _separator = separator 92 | for s in splits: 93 | if len(s) < chunk_size: 94 | _good_splits.append(s) 95 | else: 96 | if _good_splits: 97 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 98 | final_chunks.extend(merged_text) 99 | _good_splits = [] 100 | if not new_separators: 101 | final_chunks.append(s) 102 | else: 103 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 104 | final_chunks.extend(other_info) 105 | if _good_splits: 106 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 107 | final_chunks.extend(merged_text) 108 | return final_chunks 109 | 110 | 111 | def split_text_with_regex(text: str, separator: str) -> list[str]: 112 | # Now that we have the separator, split the text 113 | if separator: 114 | # The parentheses in the pattern keep the delimiters in the result. 115 | _splits = re.split(f"({separator})", text) 116 | splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] 117 | if len(_splits) % 2 == 0: 118 | splits += _splits[-1:] 119 | splits = [_splits[0], *splits] 120 | else: 121 | splits = list(text) 122 | return [s for s in splits if s != ""] 123 | 124 | 125 | def merge_splits(splits: Iterable[str], separator: str, chunk_size, chunk_overlap) -> list[str]: 126 | # We now want to combine these smaller pieces into medium size 127 | # chunks to send to the LLM. 
128 | separator_len = len(separator) 129 | 130 | docs = [] 131 | current_doc: list[str] = [] 132 | total = 0 133 | for d in splits: 134 | _len = len(d) 135 | if total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size: 136 | if total > chunk_size: 137 | logging.warning(f"Created a chunk of size {total}, which is longer than the specified {chunk_size}") 138 | if len(current_doc) > 0: 139 | doc = join_docs(current_doc, separator) 140 | if doc is not None: 141 | docs.append(doc) 142 | # Keep on popping if: 143 | # - we have a larger chunk than in the chunk overlap 144 | # - or if we still have any chunks and the length is long 145 | while total > chunk_overlap or ( 146 | total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size and total > 0 147 | ): 148 | total -= len(current_doc[0]) + (separator_len if len(current_doc) > 1 else 0) 149 | current_doc = current_doc[1:] 150 | current_doc.append(d) 151 | total += _len + (separator_len if len(current_doc) > 1 else 0) 152 | doc = join_docs(current_doc, separator) 153 | if doc is not None: 154 | docs.append(doc) 155 | return docs 156 | 157 | 158 | def join_docs(docs: list[str], separator: str) -> str | None: 159 | text = separator.join(docs) 160 | text = text.strip() 161 | 162 | if text == "": 163 | return None 164 | else: 165 | return text 166 | 167 | 168 | def process_documents( 169 | documents: Doc, 170 | parse_config_directory: str, 171 | parse_config_file: str, 172 | ) -> list: 173 | # You can use spacy.explain to get a description for these terms 174 | # Or see the model in https://spacy.io/usage/models and look for model label data 175 | 176 | parse_config_path = join(".", parse_config_directory, parse_config_file) 177 | if exists(parse_config_path): 178 | with open(parse_config_path) as key_file: 179 | filter_content = key_file.read() 180 | filter_configs = json.loads(filter_content) 181 | else: 182 | logging.info("Could not load parse config file") 183 | return 184 | 185 | ngrams_list = filter_configs["ngs"] 186 | entities_list = filter_configs["entities"] 187 | noun_chunks = filter_configs["noun_chunks"] 188 | extract_type = filter_configs["extract_type"] 189 | 190 | logging.debug("Extracting terms from corpus") 191 | extracted_terms = terms( 192 | documents, 193 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 194 | ents=partial( 195 | entities, 196 | include_types=entities_list, 197 | ), 198 | dedupe=True, 199 | ) 200 | 201 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 202 | 203 | logging.debug(f"{len(lemma_strings)} metadata keys created") 204 | return lemma_strings 205 | 206 | 207 | @click.command() 208 | @click.option( 209 | "--documents-directory", 210 | "-d", 211 | "documents_directory", 212 | default="./run_files/documents/skynet", 213 | help="The directory where your text files are stored", 214 | ) 215 | @click.option( 216 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 
217 | ) 218 | @click.option( 219 | "--keyfile-name", 220 | "-k", 221 | "keyfile_name", 222 | default="keyfile", 223 | help="Keyfile name.", 224 | ) 225 | @click.option( 226 | "--model", 227 | "-m", 228 | default="en_core_web_lg", 229 | help="The spacy model to parse the text", 230 | ) 231 | @click.option( 232 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 233 | ) 234 | @click.option( 235 | "--parse-config-file", 236 | "-pcf", 237 | default="ner_types_analyze.json", 238 | help="The parse config file", 239 | ) 240 | @click.option( 241 | "--chunk-size", 242 | "-cs", 243 | "chunk_size", 244 | type=int, 245 | default=1000000, 246 | help="The text chunk size for parsing. Default spacy maximum chunk size", 247 | ) 248 | @click.option( 249 | "--chunk-overlap", 250 | "-co", 251 | "chunk_overlap", 252 | default=0, 253 | type=int, 254 | help="The overlap for text chunks for parsing", 255 | ) 256 | def main( 257 | documents_directory: str, 258 | key_storage: str, 259 | keyfile_name: str, 260 | model: str, 261 | parse_config_directory: str, 262 | parse_config_file: str, 263 | chunk_size: int, 264 | chunk_overlap: int, 265 | ) -> None: 266 | """This script is a rough implementation of Class-based TF-IDF. 267 | See: https://www.maartengrootendorst.com/blog/ctfidf/ 268 | Tries to find words that represent a topic. Needs a large amount of topics to work. 269 | This is just a crude proof of concept. Has poor performance and some hacks. 270 | """ 271 | documents_pattern = os.path.join(documents_directory, "*.txt") 272 | documents_paths_txt = glob.glob(documents_pattern) 273 | 274 | # Contains the total occurances of words in all material 275 | all_words = {} 276 | # Contains a the occurance frequency in the doc and c-TF-IDF value 277 | all_docs = {} 278 | # Total word counts in docs 279 | total_word_count = {} 280 | for txt_document in documents_paths_txt: 281 | logging.info(f"Parsing: {txt_document}") 282 | with open(txt_document, encoding="utf-8") as f: 283 | content = f.read() 284 | parts = split_text(content, chunk_size, chunk_overlap) 285 | data = {} 286 | for part in parts: 287 | doc = core.make_spacy_doc(part, lang=model) 288 | words = process_documents(doc, parse_config_directory, parse_config_file) 289 | 290 | for word in words: 291 | # TODO Better word filter 292 | if word not in [ 293 | "User", 294 | "user", 295 | ]: 296 | if word in data.keys(): 297 | data[word] = data[word] + 1 298 | else: 299 | data[word] = 1 300 | if word in all_words.keys(): 301 | all_words[word] = all_words[word] + 1 302 | else: 303 | all_words[word] = 1 304 | all_docs[txt_document] = data 305 | 306 | frequency = 0 307 | for _key, word in data.items(): 308 | frequency = frequency + word 309 | total_word_count[txt_document] = frequency 310 | 311 | total_documents = len(all_docs) 312 | # TODO Process documents in threads? 
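    # The loop below scores each word per document with a class-based TF-IDF:
    #   tf    = word_count_in_class / words_in_class_total
    #   idf   = max(log(total_documents / word_count_in_total), 1)
    #   score = tf * idf * 100000   (scaled only for readability)
    # The keyfile then maps word -> (raw count in this document, score), sorted with the
    # highest-scoring topic words first.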
313 | for doc_name, adoc in all_docs.items(): 314 | logging.info(f"Words for document - {doc_name}: {len(adoc)}") 315 | logging.info(f"Calculating values for: {doc_name}") 316 | topic_collection = {} 317 | for word_key, word_count in adoc.items(): 318 | word_count_in_class = word_count 319 | words_in_class_total = total_word_count[doc_name] 320 | word_count_in_total = all_words[word_key] 321 | log_part = math.log(total_documents / word_count_in_total) 322 | log_part = max(log_part, 1) 323 | reference_value = (word_count_in_class / words_in_class_total * log_part) * 100000 324 | reference_tuple = (word_count, reference_value) 325 | topic_collection[word_key] = reference_tuple 326 | 327 | topic_collection = dict(sorted(topic_collection.items(), key=lambda item: item[1][1], reverse=True)) 328 | 329 | collection_name = Path(doc_name).stem 330 | complete_name = collection_name + "_" + keyfile_name 331 | key_storage_path = os.path.join(key_storage, complete_name + ".json") 332 | logging.info(f"Saving values for: {doc_name}") 333 | with open(key_storage_path, mode="w", encoding="utf-8") as key_file: 334 | json.dump(topic_collection, key_file) 335 | 336 | 337 | if __name__ == "__main__": 338 | main() 339 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/general_word_frequency.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import os 5 | import re 6 | from collections.abc import Iterable 7 | from functools import partial 8 | from os.path import exists, join 9 | 10 | import click 11 | from document_parsing.extract import entities, ngrams, terms 12 | from document_parsing.extract.basics import terms_to_strings 13 | from document_parsing.spacier import core 14 | from dotenv import find_dotenv, load_dotenv 15 | from spacy.tokens import Doc 16 | 17 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 18 | 19 | load_dotenv(find_dotenv()) 20 | 21 | SPACY_CHARACTER_LIMIT = 1000000 22 | 23 | 24 | def split_text( 25 | text: str, 26 | chunk_size: int, 27 | chunk_overlap: int, 28 | ) -> list[str]: 29 | separators = ["\n\n", "\n", " ", ""] 30 | 31 | """Split incoming text and return chunks.""" 32 | final_chunks = [] 33 | # Get appropriate separator to use 34 | separator = separators[-1] 35 | new_separators = [] 36 | for i, _s in enumerate(separators): 37 | _separator = re.escape(_s) 38 | if _s == "": 39 | separator = _s 40 | break 41 | if re.search(_separator, text): 42 | separator = _s 43 | new_separators = separators[i + 1 :] 44 | break 45 | 46 | _separator = re.escape(separator) 47 | splits = split_text_with_regex(text, _separator) 48 | 49 | # Now go merging things, recursively splitting longer texts. 
50 | _good_splits = [] 51 | _separator = "" 52 | for s in splits: 53 | if _good_splits: 54 | merged_text = merge_splits(_good_splits, _separator) 55 | final_chunks.extend(merged_text) 56 | _good_splits = [] 57 | if not new_separators: 58 | final_chunks.append(s) 59 | else: 60 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 61 | final_chunks.extend(other_info) 62 | if _good_splits: 63 | merged_text = merge_splits(_good_splits, _separator) 64 | final_chunks.extend(merged_text) 65 | return final_chunks 66 | 67 | 68 | def _split_text(text: str, separators: list[str], chunk_size: int, chunk_overlap: int) -> list[str]: 69 | """Split incoming text and return chunks.""" 70 | final_chunks = [] 71 | # Get appropriate separator to use 72 | separator = separators[-1] 73 | new_separators = [] 74 | for i, _s in enumerate(separators): 75 | _separator = re.escape(_s) 76 | if _s == "": 77 | separator = _s 78 | break 79 | if re.search(_separator, text): 80 | separator = _s 81 | new_separators = separators[i + 1 :] 82 | break 83 | 84 | _separator = re.escape(separator) 85 | splits = split_text_with_regex(text, _separator) 86 | 87 | # Now go merging things, recursively splitting longer texts. 88 | _good_splits = [] 89 | _separator = separator 90 | for s in splits: 91 | if len(s) < chunk_size: 92 | _good_splits.append(s) 93 | else: 94 | if _good_splits: 95 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 96 | final_chunks.extend(merged_text) 97 | _good_splits = [] 98 | if not new_separators: 99 | final_chunks.append(s) 100 | else: 101 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 102 | final_chunks.extend(other_info) 103 | if _good_splits: 104 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 105 | final_chunks.extend(merged_text) 106 | return final_chunks 107 | 108 | 109 | def split_text_with_regex(text: str, separator: str) -> list[str]: 110 | # Now that we have the separator, split the text 111 | if separator: 112 | # The parentheses in the pattern keep the delimiters in the result. 113 | _splits = re.split(f"({separator})", text) 114 | splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] 115 | if len(_splits) % 2 == 0: 116 | splits += _splits[-1:] 117 | splits = [_splits[0], *splits] 118 | else: 119 | splits = list(text) 120 | return [s for s in splits if s != ""] 121 | 122 | 123 | def merge_splits(splits: Iterable[str], separator: str, chunk_size, chunk_overlap) -> list[str]: 124 | # We now want to combine these smaller pieces into medium size 125 | # chunks to send to the LLM. 
126 | separator_len = len(separator) 127 | 128 | docs = [] 129 | current_doc: list[str] = [] 130 | total = 0 131 | for d in splits: 132 | _len = len(d) 133 | if total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size: 134 | if total > chunk_size: 135 | logging.warning(f"Created a chunk of size {total}, which is longer than the specified {chunk_size}") 136 | if len(current_doc) > 0: 137 | doc = join_docs(current_doc, separator) 138 | if doc is not None: 139 | docs.append(doc) 140 | # Keep on popping if: 141 | # - we have a larger chunk than in the chunk overlap 142 | # - or if we still have any chunks and the length is long 143 | while total > chunk_overlap or ( 144 | total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size and total > 0 145 | ): 146 | total -= len(current_doc[0]) + (separator_len if len(current_doc) > 1 else 0) 147 | current_doc = current_doc[1:] 148 | current_doc.append(d) 149 | total += _len + (separator_len if len(current_doc) > 1 else 0) 150 | doc = join_docs(current_doc, separator) 151 | if doc is not None: 152 | docs.append(doc) 153 | return docs 154 | 155 | 156 | def join_docs(docs: list[str], separator: str) -> str | None: 157 | text = separator.join(docs) 158 | text = text.strip() 159 | 160 | if text == "": 161 | return None 162 | else: 163 | return text 164 | 165 | 166 | def process_documents( 167 | documents: Doc, 168 | parse_config_directory: str, 169 | parse_config_file: str, 170 | ) -> list: 171 | # You can use spacy.explain to get a description for these terms 172 | # Or see the model in https://spacy.io/usage/models and look for model label data 173 | 174 | parse_config_path = join(".", parse_config_directory, parse_config_file) 175 | if exists(parse_config_path): 176 | with open(parse_config_path) as key_file: 177 | filter_content = key_file.read() 178 | filter_configs = json.loads(filter_content) 179 | else: 180 | logging.info("Could not load parse config file") 181 | return 182 | 183 | ngrams_list = filter_configs["ngs"] 184 | entities_list = filter_configs["entities"] 185 | noun_chunks = filter_configs["noun_chunks"] 186 | extract_type = filter_configs["extract_type"] 187 | 188 | logging.debug("Extracting terms from corpus") 189 | extracted_terms = terms( 190 | documents, 191 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 192 | ents=partial( 193 | entities, 194 | include_types=entities_list, 195 | ), 196 | dedupe=True, 197 | ) 198 | 199 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 200 | 201 | logging.debug(f"{len(lemma_strings)} metadata keys created") 202 | return lemma_strings 203 | 204 | 205 | @click.command() 206 | @click.option( 207 | "--documents-directory", 208 | "-d", 209 | "documents_directory", 210 | default="./run_files/documents/skynet", 211 | help="The directory where your text files are stored", 212 | ) 213 | @click.option( 214 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 
215 | ) 216 | @click.option( 217 | "--keyfile-name", 218 | "-k", 219 | "keyfile_name", 220 | default="keyfile.json", 221 | help="Keyfile name.", 222 | ) 223 | @click.option( 224 | "--model", 225 | "-m", 226 | default="en_core_web_lg", 227 | help="The spacy model to parse the text", 228 | ) 229 | @click.option( 230 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 231 | ) 232 | @click.option( 233 | "--parse-config-file", 234 | "-pcf", 235 | default="ner_types_full.json", 236 | help="The parse config file", 237 | ) 238 | @click.option( 239 | "--chunk-size", 240 | "-cs", 241 | "chunk_size", 242 | type=int, 243 | default=1000000, 244 | help="The text chunk size for parsing. Default spacy maximum chunk size", 245 | ) 246 | @click.option( 247 | "--chunk-overlap", 248 | "-co", 249 | "chunk_overlap", 250 | default=0, 251 | type=int, 252 | help="The overlap for text chunks for parsing", 253 | ) 254 | def main( 255 | documents_directory: str, 256 | key_storage: str, 257 | keyfile_name: str, 258 | model: str, 259 | parse_config_directory: str, 260 | parse_config_file: str, 261 | chunk_size: int, 262 | chunk_overlap: int, 263 | ) -> None: 264 | """Parse ner keywords from text using spacy and grammar configuration files.""" 265 | documents_pattern = os.path.join(documents_directory, "*.txt") 266 | documents_paths_txt = glob.glob(documents_pattern) 267 | 268 | # TODO c-TF-IDF instead of frequency 269 | # Lemma graphs and matplotlib representations 270 | 271 | data = {} 272 | for txt_document in documents_paths_txt: 273 | logging.info(f"Parsing: {txt_document}") 274 | with open(txt_document, encoding="utf-8") as f: 275 | content = f.read() 276 | parts = split_text(content, chunk_size, chunk_overlap) 277 | 278 | for part in parts: 279 | doc = core.make_spacy_doc(part, lang=model) 280 | words = process_documents(doc, parse_config_directory, parse_config_file) 281 | for word in words: 282 | if word in data.keys(): 283 | data[word] = data[word] + 1 284 | else: 285 | data[word] = 1 286 | 287 | # Filter words that occure only once 288 | data = {k: v for k, v in data.items() if v > 1} 289 | 290 | # Sort with most common first 291 | sorted_data = dict(sorted(data.items(), key=lambda item: item[1], reverse=True)) 292 | 293 | logging.info(f"Total words: {len(data)}") 294 | key_storage_path = os.path.join(key_storage, keyfile_name + ".json") 295 | with open(key_storage_path, mode="w", encoding="utf-8") as key_file: 296 | json.dump(sorted_data, key_file) 297 | 298 | 299 | if __name__ == "__main__": 300 | main() 301 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/generate_uuid.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import uuid 3 | 4 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 5 | 6 | logging.debug(str(uuid.uuid1())) 7 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/parse_ner.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | import re 7 | import uuid 8 | from collections.abc import Iterable 9 | from functools import partial 10 | from multiprocessing import Manager, Pool 11 | from os.path import exists, join 12 | from queue import Queue 13 | 14 | import click 15 
| import pandas as pd 16 | from document_parsing.extract import entities, ngrams, terms 17 | from document_parsing.extract.basics import terms_to_strings 18 | from document_parsing.spacier import core 19 | from dotenv import find_dotenv, load_dotenv 20 | from spacy.tokens import Doc 21 | 22 | # This is the config for multiprocess logger 23 | # Setting the level to debug outputs multiprocess debug lines too 24 | NER_LOGGER = mp.get_logger() 25 | FORMAT = "%(levelname)s:%(message)s" 26 | formatter = logging.Formatter(fmt=FORMAT) 27 | handler = logging.StreamHandler() 28 | handler.setFormatter(formatter) 29 | 30 | NER_LOGGER.addHandler(handler) 31 | NER_LOGGER.setLevel(logging.INFO) 32 | 33 | load_dotenv(find_dotenv()) 34 | 35 | SPACY_CHARACTER_LIMIT = 1000000 36 | 37 | 38 | def split_text( 39 | text: str, 40 | chunk_size: int, 41 | chunk_overlap: int, 42 | ) -> list[str]: 43 | separators = ["\n\n", "\n", " ", ""] 44 | 45 | """Split incoming text and return chunks.""" 46 | final_chunks = [] 47 | # Get appropriate separator to use 48 | separator = separators[-1] 49 | new_separators = [] 50 | for i, _s in enumerate(separators): 51 | _separator = re.escape(_s) 52 | if _s == "": 53 | separator = _s 54 | break 55 | if re.search(_separator, text): 56 | separator = _s 57 | new_separators = separators[i + 1 :] 58 | break 59 | 60 | _separator = re.escape(separator) 61 | splits = split_text_with_regex(text, _separator) 62 | 63 | # Now go merging things, recursively splitting longer texts. 64 | _good_splits = [] 65 | _separator = "" 66 | for s in splits: 67 | if _good_splits: 68 | merged_text = merge_splits(_good_splits, _separator) 69 | final_chunks.extend(merged_text) 70 | _good_splits = [] 71 | if not new_separators: 72 | final_chunks.append(s) 73 | else: 74 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 75 | final_chunks.extend(other_info) 76 | if _good_splits: 77 | merged_text = merge_splits(_good_splits, _separator) 78 | final_chunks.extend(merged_text) 79 | return final_chunks 80 | 81 | 82 | def _split_text(text: str, separators: list[str], chunk_size: int, chunk_overlap: int) -> list[str]: 83 | """Split incoming text and return chunks.""" 84 | final_chunks = [] 85 | # Get appropriate separator to use 86 | separator = separators[-1] 87 | new_separators = [] 88 | for i, _s in enumerate(separators): 89 | _separator = re.escape(_s) 90 | if _s == "": 91 | separator = _s 92 | break 93 | if re.search(_separator, text): 94 | separator = _s 95 | new_separators = separators[i + 1 :] 96 | break 97 | 98 | _separator = re.escape(separator) 99 | splits = split_text_with_regex(text, _separator) 100 | 101 | # Now go merging things, recursively splitting longer texts. 
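    # In this recursive variant, pieces shorter than chunk_size are buffered in
    # _good_splits and merged via merge_splits(); longer pieces flush the buffer and are
    # either re-split with the remaining, finer separators or appended as-is when none
    # remain.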
102 | _good_splits = [] 103 | _separator = separator 104 | for s in splits: 105 | if len(s) < chunk_size: 106 | _good_splits.append(s) 107 | else: 108 | if _good_splits: 109 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 110 | final_chunks.extend(merged_text) 111 | _good_splits = [] 112 | if not new_separators: 113 | final_chunks.append(s) 114 | else: 115 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 116 | final_chunks.extend(other_info) 117 | if _good_splits: 118 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 119 | final_chunks.extend(merged_text) 120 | return final_chunks 121 | 122 | 123 | def split_text_with_regex(text: str, separator: str) -> list[str]: 124 | # Now that we have the separator, split the text 125 | if separator: 126 | # The parentheses in the pattern keep the delimiters in the result. 127 | _splits = re.split(f"({separator})", text) 128 | splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] 129 | if len(_splits) % 2 == 0: 130 | splits += _splits[-1:] 131 | splits = [_splits[0], *splits] 132 | else: 133 | splits = list(text) 134 | return [s for s in splits if s != ""] 135 | 136 | 137 | def merge_splits(splits: Iterable[str], separator: str, chunk_size, chunk_overlap) -> list[str]: 138 | # We now want to combine these smaller pieces into medium size 139 | # chunks to send to the LLM. 140 | separator_len = len(separator) 141 | 142 | docs = [] 143 | current_doc: list[str] = [] 144 | total = 0 145 | for d in splits: 146 | _len = len(d) 147 | if total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size: 148 | if total > chunk_size: 149 | NER_LOGGER.warning(f"Created a chunk of size {total}, which is longer than the specified {chunk_size}") 150 | if len(current_doc) > 0: 151 | doc = join_docs(current_doc, separator) 152 | if doc is not None: 153 | docs.append(doc) 154 | # Keep on popping if: 155 | # - we have a larger chunk than in the chunk overlap 156 | # - or if we still have any chunks and the length is long 157 | while total > chunk_overlap or ( 158 | total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size and total > 0 159 | ): 160 | total -= len(current_doc[0]) + (separator_len if len(current_doc) > 1 else 0) 161 | current_doc = current_doc[1:] 162 | current_doc.append(d) 163 | total += _len + (separator_len if len(current_doc) > 1 else 0) 164 | doc = join_docs(current_doc, separator) 165 | if doc is not None: 166 | docs.append(doc) 167 | return docs 168 | 169 | 170 | def join_docs(docs: list[str], separator: str) -> str | None: 171 | text = separator.join(docs) 172 | text = text.strip() 173 | 174 | if text == "": 175 | return None 176 | else: 177 | return text 178 | 179 | 180 | def process_documents( 181 | documents: Doc, 182 | parse_config_directory: str, 183 | parse_config_file: str, 184 | ) -> pd.Series: 185 | # You can use spacy.explain to get a description for these terms 186 | # Or see the model in https://spacy.io/usage/models and look for model label data 187 | 188 | parse_config_path = join(".", parse_config_directory, parse_config_file) 189 | if exists(parse_config_path): 190 | with open(parse_config_path) as key_file: 191 | filter_content = key_file.read() 192 | filter_configs = json.loads(filter_content) 193 | else: 194 | NER_LOGGER.info("Could not load parse config file") 195 | return 196 | 197 | ngrams_list = filter_configs["ngs"] 198 | entities_list = filter_configs["entities"] 199 | noun_chunks = 
filter_configs["noun_chunks"] 200 | extract_type = filter_configs["extract_type"] 201 | 202 | NER_LOGGER.info("Extracting terms from corpus") 203 | extracted_terms = terms( 204 | documents, 205 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 206 | ents=partial( 207 | entities, 208 | include_types=entities_list, 209 | ), 210 | dedupe=True, 211 | ) 212 | 213 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 214 | all_keys = {} 215 | 216 | NER_LOGGER.info(f"{len(lemma_strings)} metadata keys created") 217 | 218 | # Create uuids for metadata filters 219 | for line in lemma_strings: 220 | filter_uuid = str(uuid.uuid1()) 221 | all_keys[filter_uuid] = line 222 | return pd.Series(all_keys) 223 | 224 | 225 | def read_chuncks(text_corpus, chunk_size, chunk_overlap, que, reader_num) -> bool: 226 | NER_LOGGER.info("Reading chuncks to que") 227 | parts = split_text(text_corpus, chunk_size, chunk_overlap) 228 | for doc in parts: 229 | que.put(doc) 230 | for _i in range(reader_num): 231 | que.put("QUEUE_DONE") 232 | NER_LOGGER.info("Reader done") 233 | return True 234 | 235 | 236 | def process_chuncks(model, parse_config_directory, parse_config_file, read_que: Queue, write_que: Queue, name) -> bool: 237 | NER_LOGGER.info(f"Processor {name} reading chuncks from que") 238 | while True: 239 | try: 240 | corpus = read_que.get(timeout=10) 241 | except Exception as e: 242 | NER_LOGGER.info(f"Processor {name} timed out: {e}") 243 | write_que.put("QUEUE_DONE") 244 | return False 245 | 246 | if corpus == "QUEUE_DONE": 247 | NER_LOGGER.info(f"Processor {name} done") 248 | write_que.put("QUEUE_DONE") 249 | break 250 | doc = core.make_spacy_doc(corpus, lang=model) 251 | pseries = process_documents(doc, parse_config_directory, parse_config_file) 252 | write_que.put(pseries) 253 | return True 254 | 255 | 256 | def clean_and_merge_chunks(que: Queue, name) -> pd.DataFrame: 257 | NER_LOGGER.info(f"cleaner {name} reading chuncks from que") 258 | df = None 259 | while True: 260 | try: 261 | corpus = que.get(timeout=10) 262 | except Exception as e: 263 | NER_LOGGER.info(f"Writer {name} timed out: {e}") 264 | df["Content"].apply(lambda x: x.strip()) 265 | # TODO Place this filter in config file 266 | # Removes one and two letter words 267 | m = ~df.apply(lambda x: x.str.contains("\\b[a-zA-Z]{1,2}\\b")).any(axis=1) 268 | df = df[m] 269 | return df 270 | if not isinstance(corpus, pd.Series) and corpus == "QUEUE_DONE": 271 | NER_LOGGER.info(f"Writer {name} received done") 272 | break 273 | elif isinstance(corpus, pd.Series): 274 | NER_LOGGER.info(f"Writer {name} received a chunck") 275 | if df is None: 276 | df = pd.DataFrame(corpus, columns=["Content"]) 277 | else: 278 | df2 = pd.DataFrame(corpus, columns=["Content"]) 279 | df = pd.concat([df, df2]) 280 | 281 | df["Content"].apply(lambda x: x.strip()) 282 | # TODO Place this filter in config file 283 | # Removes one and two letter words 284 | m = ~df.apply(lambda x: x.str.contains("\\b[a-zA-Z]{1,2}\\b")).any(axis=1) 285 | df = df[m] 286 | NER_LOGGER.info(f"writer {name} - Total amount of keys created: {len(df.index)}") 287 | 288 | return df 289 | 290 | 291 | @click.command() 292 | @click.option( 293 | "--documents-directory", 294 | "-d", 295 | "documents_directory", 296 | default="./run_files/documents/skynet", 297 | help="The directory where your text files are stored", 298 | ) 299 | @click.option( 300 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 
301 | ) 302 | @click.option( 303 | "--keyfile-name", 304 | "-k", 305 | "keyfile_name", 306 | default="keyfile.json", 307 | help="Keyfile name.", 308 | ) 309 | @click.option( 310 | "--model", 311 | "-m", 312 | default="en_core_web_lg", 313 | help="The spacy model to parse the text", 314 | ) 315 | @click.option( 316 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 317 | ) 318 | @click.option( 319 | "--parse-config-file", 320 | "-pcf", 321 | default="ner_types.json", 322 | help="The parse config file", 323 | ) 324 | @click.option( 325 | "--chunk-size", 326 | "-cs", 327 | "chunk_size", 328 | type=int, 329 | default=1000000, 330 | help="The text chunk size for parsing. Default spacy maximum chunk size", 331 | ) 332 | @click.option( 333 | "--chunk-overlap", 334 | "-co", 335 | "chunk_overlap", 336 | default=0, 337 | type=int, 338 | help="The overlap for text chunks for parsing", 339 | ) 340 | @click.option( 341 | "--threads", 342 | "-t", 343 | default=6, 344 | type=int, 345 | help="The parse config file", 346 | ) 347 | def main( 348 | documents_directory: str, 349 | key_storage: str, 350 | keyfile_name: str, 351 | model: str, 352 | parse_config_directory: str, 353 | parse_config_file: str, 354 | chunk_size: int, 355 | chunk_overlap: int, 356 | threads: int, 357 | ) -> None: 358 | """Parse ner keywords from text using spacy and grammar configuration files.""" 359 | documents_pattern = os.path.join(documents_directory, "*.txt") 360 | documents_paths_txt = glob.glob(documents_pattern) 361 | text_corpus = "" 362 | 363 | for txt_document in documents_paths_txt: 364 | NER_LOGGER.info(f"Reading: {txt_document}") 365 | with open(txt_document, encoding="utf-8") as f: 366 | content = f.read() 367 | text_corpus = text_corpus + content 368 | 369 | manager = Manager() 370 | read_que = manager.Queue() 371 | write_que = manager.Queue() 372 | 373 | pool = Pool(threads) 374 | 375 | reader = pool.apply_async( 376 | read_chuncks, 377 | ( 378 | text_corpus, 379 | chunk_size, 380 | chunk_overlap, 381 | read_que, 382 | threads, 383 | ), 384 | ) 385 | 386 | read_success = reader.get() 387 | if not read_success: 388 | return 389 | 390 | jobs = [] 391 | for i in range(threads): 392 | job = pool.apply_async( 393 | process_chuncks, 394 | ( 395 | model, 396 | parse_config_directory, 397 | parse_config_file, 398 | read_que, 399 | write_que, 400 | i, 401 | ), 402 | ) 403 | jobs.append(job) 404 | 405 | for job in jobs: 406 | job.get() 407 | 408 | jobs = [] 409 | for i in range(threads): 410 | job = pool.apply_async( 411 | clean_and_merge_chunks, 412 | ( 413 | write_que, 414 | i, 415 | ), 416 | ) 417 | jobs.append(job) 418 | 419 | df = None 420 | for job in jobs: 421 | merge_result = job.get() 422 | if merge_result is not None: 423 | if df is None: 424 | df = merge_result 425 | else: 426 | df = pd.concat([df, merge_result]) 427 | 428 | pool.close() 429 | pool.join() 430 | 431 | if df is not None: 432 | df = df.drop_duplicates() 433 | NER_LOGGER.info(f"Total amount of keys created: {len(df.index)}") 434 | key_storage_path = os.path.join(key_storage, keyfile_name + ".json") 435 | 436 | NER_LOGGER.debug("Create key file") 437 | json_key_file = df.to_json() 438 | with open(key_storage_path, mode="w", encoding="utf-8") as key_file: 439 | key_file.write(json_key_file) 440 | 441 | NER_LOGGER.info(f"Read files from directory: {documents_directory}") 442 | NER_LOGGER.info(f"Wrote keys to: {key_storage_path}") 443 | 444 | 445 | if __name__ == "__main__": 446 | main() 447 | 
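# A minimal sketch of reading the keyfile written above. The helper name is an
# illustrative assumption; the layout follows pandas DataFrame.to_json(), i.e. a single
# "Content" column mapping generated uuid strings to extracted terms.
def _print_keyfile_terms(key_storage_path: str) -> None:
    import json

    with open(key_storage_path, encoding="utf-8") as key_file:
        keys = json.load(key_file)
    # keys["Content"] is a dict of {uuid: term}
    for filter_uuid, term in keys["Content"].items():
        NER_LOGGER.info(f"{filter_uuid}: {term}")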
-------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/spacy_explain.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | print(spacy.explain("PROPN")) 4 | print(spacy.explain("NOUN")) 5 | print(spacy.explain("ADJ")) 6 | print(spacy.explain("NNP")) 7 | print(spacy.explain("NN")) 8 | print(spacy.explain("AUX")) 9 | print(spacy.explain("VBZ")) 10 | print(spacy.explain("VERB")) 11 | print(spacy.explain("ADP")) 12 | print(spacy.explain("SYM")) 13 | print(spacy.explain("NUM")) 14 | print(spacy.explain("CD")) 15 | print(spacy.explain("VBG")) 16 | print(spacy.explain("ROOT")) 17 | 18 | print(spacy.explain("FAC")) 19 | print(spacy.explain("NORP")) 20 | print(spacy.explain("GPE")) 21 | print(spacy.explain("PRODUCT")) 22 | print(spacy.explain("EVENT")) 23 | print(spacy.explain("PERSON")) 24 | print(spacy.explain("ORG")) 25 | print(spacy.explain("LOC")) 26 | print(spacy.explain("DATE")) 27 | print(spacy.explain("TIME")) 28 | print(spacy.explain("WORK_OF_ART")) 29 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/document_parsing/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from document_parsing.extract.basics import entities, ngrams, noun_chunks, terms 2 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/extract/basics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basics 3 | ------ 4 | 5 | :mod:`textacy.extract.basics`: Extract basic components from a document or sentence 6 | via spaCy, with bells and whistles for filtering the results. 7 | """ 8 | 9 | import operator 10 | from collections.abc import Callable, Collection, Iterable 11 | from functools import partial 12 | 13 | from cytoolz import itertoolz 14 | from spacy.parts_of_speech import DET 15 | from spacy.tokens import Span 16 | 17 | from document_parsing.utils import constants, errors, types, utils 18 | 19 | 20 | def ngrams( 21 | doclike: types.DocLike, 22 | n: int | Collection[int], 23 | *, 24 | filter_stops: bool = True, 25 | filter_punct: bool = True, 26 | filter_nums: bool = False, 27 | include_pos: str | Collection[str] | None = None, 28 | exclude_pos: str | Collection[str] | None = None, 29 | min_freq: int = 1, 30 | ) -> Iterable[Span]: 31 | """ 32 | Extract an ordered sequence of n-grams (``n`` consecutive tokens) from a spaCy 33 | ``Doc`` or ``Span``, for one or multiple ``n`` values, optionally filtering n-grams 34 | by the types and parts-of-speech of the constituent tokens. 35 | 36 | Args: 37 | doclike 38 | n: Number of tokens included per n-gram; for example, ``2`` yields bigrams 39 | and ``3`` yields trigrams. If multiple values are specified, then the 40 | collections of n-grams are concatenated together; for example, ``(2, 3)`` 41 | yields bigrams and then trigrams. 42 | filter_stops: If True, remove ngrams that start or end with a stop word. 
43 | filter_punct: If True, remove ngrams that contain any punctuation-only tokens. 44 | filter_nums: If True, remove ngrams that contain any numbers 45 | or number-like tokens (e.g. 10, 'ten'). 46 | include_pos: Remove ngrams if any constituent tokens' part-of-speech tags 47 | ARE NOT included in this param. 48 | exclude_pos: Remove ngrams if any constituent tokens' part-of-speech tags 49 | ARE included in this param. 50 | min_freq: Remove ngrams that occur in ``doclike`` fewer than ``min_freq`` times 51 | 52 | Yields: 53 | Next ngram from ``doclike`` passing all specified filters, in order of appearance 54 | in the document. 55 | 56 | Raises: 57 | ValueError: if any ``n`` < 1 58 | TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str, 59 | or a falsy value 60 | 61 | Note: 62 | Filtering by part-of-speech tag uses the universal POS tag set; for details, 63 | check spaCy's docs: https://spacy.io/api/annotation#pos-tagging 64 | """ 65 | ns_: tuple[int, ...] = utils.to_tuple(n) 66 | if any(n_ < 1 for n_ in ns_): 67 | msg = "n must be greater than or equal to 1" 68 | raise ValueError(msg) 69 | 70 | ngrams_: Iterable[Span] 71 | for n_ in ns_: 72 | ngrams_ = (doclike[i : i + n_] for i in range(len(doclike) - n_ + 1)) 73 | ngrams_ = (ng for ng in ngrams_ if not any(w.is_space for w in ng)) 74 | if filter_stops is True: 75 | ngrams_ = (ng for ng in ngrams_ if not ng[0].is_stop and not ng[-1].is_stop) 76 | if filter_punct is True: 77 | ngrams_ = (ng for ng in ngrams_ if not any(w.is_punct for w in ng)) 78 | if filter_nums is True: 79 | ngrams_ = (ng for ng in ngrams_ if not any(w.like_num for w in ng)) 80 | if include_pos: 81 | include_pos_: set[str] = {pos.upper() for pos in utils.to_set(include_pos)} 82 | ngrams_ = (ng for ng in ngrams_ if all(w.pos_ in include_pos_ for w in ng)) 83 | if exclude_pos: 84 | exclude_pos_: set[str] = {pos.upper() for pos in utils.to_set(exclude_pos)} 85 | ngrams_ = (ng for ng in ngrams_ if not any(w.pos_ in exclude_pos_ for w in ng)) 86 | if min_freq > 1: 87 | ngrams_ = list(ngrams_) 88 | freqs = itertoolz.frequencies(ng.text.lower() for ng in ngrams_) 89 | ngrams_ = (ng for ng in ngrams_ if freqs[ng.text.lower()] >= min_freq) 90 | 91 | yield from ngrams_ 92 | 93 | 94 | def entities( 95 | doclike: types.DocLike, 96 | *, 97 | include_types: str | Collection[str] | None = None, 98 | exclude_types: str | Collection[str] | None = None, 99 | drop_determiners: bool = True, 100 | min_freq: int = 1, 101 | ) -> Iterable[Span]: 102 | """ 103 | Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from 104 | a ``Doc``, optionally filtering by entity types and frequencies. 105 | 106 | Args: 107 | doclike 108 | include_types: Remove entities whose type IS NOT 109 | in this param; if "NUMERIC", all numeric entity types ("DATE", 110 | "MONEY", "ORDINAL", etc.) are included 111 | exclude_types: Remove entities whose type IS 112 | in this param; if "NUMERIC", all numeric entity types ("DATE", 113 | "MONEY", "ORDINAL", etc.) are excluded 114 | drop_determiners: Remove leading determiners (e.g. "the") 115 | from entities (e.g. "the United States" => "United States"). 116 | 117 | .. note:: Entities from which a leading determiner has been removed 118 | are, effectively, *new* entities, and not saved to the ``Doc`` 119 | from which they came. This is irritating but unavoidable, since 120 | this function is not meant to have side-effects on document state. 
121 | If you're only using the text of the returned spans, this is no 122 | big deal, but watch out if you're counting on determiner-less 123 | entities associated with the doc downstream. 124 | 125 | min_freq: Remove entities that occur in ``doclike`` fewer 126 | than ``min_freq`` times 127 | 128 | Yields: 129 | Next entity from ``doclike`` passing all specified filters in order of appearance 130 | in the document 131 | 132 | Raises: 133 | TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set of 134 | str, or a falsy value 135 | """ 136 | ents = doclike.ents 137 | 138 | include_types = _parse_ent_types(include_types, "include") 139 | exclude_types = _parse_ent_types(exclude_types, "exclude") 140 | if include_types: 141 | if isinstance(include_types, str): 142 | ents = (ent for ent in ents if ent.label_ == include_types) 143 | elif isinstance(include_types, set | frozenset | list | tuple): 144 | ents = (ent for ent in ents if ent.label_ in include_types) 145 | if exclude_types: 146 | if isinstance(exclude_types, str): 147 | ents = (ent for ent in ents if ent.label_ != exclude_types) 148 | elif isinstance(exclude_types, set | frozenset | list | tuple): 149 | ents = (ent for ent in ents if ent.label_ not in exclude_types) 150 | if drop_determiners is True: 151 | ents = ( 152 | ent if ent[0].pos != DET else Span(ent.doc, ent.start + 1, ent.end, label=ent.label, vector=ent.vector) 153 | for ent in ents 154 | ) 155 | if min_freq > 1: 156 | ents = list(ents) # type: ignore 157 | freqs = itertoolz.frequencies(ent.text.lower() for ent in ents) 158 | ents = (ent for ent in ents if freqs[ent.text.lower()] >= min_freq) 159 | 160 | yield from ents 161 | 162 | 163 | def _parse_ent_types(ent_types: str | Collection[str] | None, which: str) -> str | set[str] | None: 164 | if not ent_types: 165 | return None 166 | elif isinstance(ent_types, str): 167 | ent_types = ent_types.upper() 168 | # replace the shorthand numeric case by its corresponding constant 169 | if ent_types == "NUMERIC": 170 | return constants.NUMERIC_ENT_TYPES 171 | else: 172 | return ent_types 173 | elif isinstance(ent_types, set | frozenset | list | tuple): 174 | ent_types = {ent_type.upper() for ent_type in ent_types} 175 | # again, replace the shorthand numeric case by its corresponding constant 176 | # and include it in the set in case other types are specified 177 | if any(ent_type == "NUMERIC" for ent_type in ent_types): 178 | return ent_types.union(constants.NUMERIC_ENT_TYPES) 179 | else: 180 | return ent_types 181 | else: 182 | raise TypeError(errors.type_invalid_msg(f"{which}_types", type(ent_types), [str | Collection[str]] | None)) 183 | 184 | 185 | def noun_chunks(doclike: types.DocLike, *, drop_determiners: bool = True, min_freq: int = 1) -> Iterable[Span]: 186 | """ 187 | Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally 188 | filtering by frequency and dropping leading determiners. 189 | 190 | Args: 191 | doclike 192 | drop_determiners: Remove leading determiners (e.g. "the") 193 | from phrases (e.g. 
"the quick brown fox" => "quick brown fox") 194 | min_freq: Remove chunks that occur in ``doclike`` fewer than ``min_freq`` times 195 | 196 | Yields: 197 | Next noun chunk from ``doclike`` in order of appearance in the document 198 | """ 199 | ncs: Iterable[Span] 200 | ncs = doclike.noun_chunks 201 | if drop_determiners is True: 202 | ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs) 203 | if min_freq > 1: 204 | ncs = list(ncs) 205 | freqs = itertoolz.frequencies(nc.text.lower() for nc in ncs) 206 | ncs = (nc for nc in ncs if freqs[nc.text.lower()] >= min_freq) 207 | 208 | yield from ncs 209 | 210 | 211 | def terms( 212 | doclike: types.DocLike, 213 | *, 214 | ngs: int | Collection[int] | types.DocLikeToSpans | None = None, 215 | ents: bool | types.DocLikeToSpans | None = None, 216 | ncs: bool | types.DocLikeToSpans | None = None, 217 | dedupe: bool = True, 218 | ) -> Iterable[Span]: 219 | """ 220 | Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks -- 221 | from ``doclike`` as a single, concatenated collection, with optional deduplication 222 | of spans extracted by more than one type. 223 | 224 | .. code-block:: pycon 225 | 226 | >>> extract.terms(doc, ngs=2, ents=True, ncs=True) 227 | >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2)) 228 | >>> extract.terms(doc, ents=extract.entities) 229 | >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON")) 230 | 231 | Args: 232 | doclike 233 | ngs: N-gram terms to be extracted. 234 | If one or multiple ints, :func:`textacy.extract.ngrams(doclike, n=ngs)` is 235 | used to extract terms; if a callable, ``ngs(doclike)`` is used to extract 236 | terms; if None, no n-gram terms are extracted. 237 | ents: Entity terms to be extracted. 238 | If True, :func:`textacy.extract.entities(doclike)` is used to extract terms; 239 | if a callable, ``ents(doclike)`` is used to extract terms; 240 | if None, no entity terms are extracted. 241 | ncs: Noun chunk terms to be extracted. 242 | If True, :func:`textacy.extract.noun_chunks(doclike)` is used to extract 243 | terms; if a callable, ``ncs(doclike)`` is used to extract terms; 244 | if None, no noun chunk terms are extracted. 245 | dedupe: If True, deduplicate terms whose spans are extracted by multiple types 246 | (e.g. a span that is both an n-gram and an entity), as identified by 247 | identical (start, stop) indexes in ``doclike``; otherwise, don't. 248 | 249 | Returns: 250 | Next term from ``doclike``, in order of n-grams then entities then noun chunks, 251 | with each collection's terms given in order of appearance. 252 | 253 | Note: 254 | This function is *not* to be confused with keyterm extraction, which leverages 255 | statistics and algorithms to quantify the "key"-ness of terms before returning 256 | the top-ranking terms. There is no such scoring or ranking here. 
257 | 258 | See Also: 259 | - :func:`textacy.extact.ngrams()` 260 | - :func:`textacy.extact.entities()` 261 | - :func:`textacy.extact.noun_chunks()` 262 | - :mod:`textacy.extact.keyterms` 263 | """ 264 | extractors = _get_extractors(ngs, ents, ncs) 265 | terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors) 266 | if dedupe is True: 267 | terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end)) 268 | yield from terms_ 269 | 270 | 271 | def _get_extractors(ngs, ents, ncs) -> list[types.DocLikeToSpans]: 272 | all_extractors = [ 273 | _get_ngs_extractor(ngs), 274 | _get_ents_extractor(ents), 275 | _get_ncs_extractor(ncs), 276 | ] 277 | extractors = [extractor for extractor in all_extractors if extractor is not None] 278 | if not extractors: 279 | msg = "at least one term extractor must be specified" 280 | raise ValueError(msg) 281 | else: 282 | return extractors 283 | 284 | 285 | def _get_ngs_extractor(ngs) -> types.DocLikeToSpans | None: 286 | if ngs is None: 287 | return None 288 | elif callable(ngs): 289 | return ngs 290 | elif isinstance(ngs, int) or (isinstance(ngs, Collection) and all(isinstance(ng, int) for ng in ngs)): 291 | return partial(ngrams, n=ngs) 292 | else: 293 | raise TypeError() 294 | 295 | 296 | def _get_ents_extractor(ents) -> types.DocLikeToSpans | None: 297 | if ents is None: 298 | return None 299 | elif callable(ents): 300 | return ents 301 | elif isinstance(ents, bool): 302 | return entities 303 | else: 304 | raise TypeError() 305 | 306 | 307 | def _get_ncs_extractor(ncs) -> types.DocLikeToSpans | None: 308 | if ncs is None: 309 | return None 310 | elif callable(ncs): 311 | return ncs 312 | elif isinstance(ncs, bool): 313 | return noun_chunks 314 | else: 315 | raise TypeError() 316 | 317 | 318 | def terms_to_strings( 319 | terms: Iterable[types.SpanLike], 320 | by: str | Callable[[types.SpanLike], str], 321 | ) -> Iterable[str]: 322 | """ 323 | Transform a sequence of terms as spaCy ``Token`` s or ``Span`` s into strings. 324 | 325 | Args: 326 | terms 327 | by: Method by which terms are transformed into strings. 328 | If "orth", terms are represented by their text exactly as written; 329 | if "lower", by the lowercased form of their text; 330 | if "lemma", by their base form w/o inflectional suffixes; 331 | if a callable, must accept a ``Token`` or ``Span`` and return a string. 332 | 333 | Yields: 334 | Next term in ``terms``, as a string. 
335 | """ 336 | terms_: Iterable[str] 337 | if by == "lower": 338 | terms_ = (term.text.lower() for term in terms) 339 | elif by in ("lemma", "orth"): 340 | by_ = operator.attrgetter(f"{by}_") 341 | terms_ = (by_(term) for term in terms) 342 | elif callable(by): 343 | terms_ = (by(term) for term in terms) 344 | else: 345 | raise ValueError(errors.value_invalid_msg("by", by, {"orth", "lower", "lemma", Callable})) 346 | yield from terms_ 347 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/filter_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | from os.path import exists, join 7 | 8 | import pandas as pd 9 | from dotenv import find_dotenv, load_dotenv 10 | 11 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 12 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 13 | load_dotenv(find_dotenv()) 14 | 15 | 16 | def main( 17 | documents_directory: str, 18 | parse_config_directory: str, 19 | parse_config_file: str, 20 | filter_config_directory: str, 21 | filter_config_file: str, 22 | ) -> None: 23 | documents_pattern = os.path.join(documents_directory, "*.csv") 24 | logging.debug(f"documents search pattern: {documents_pattern}") 25 | documents_paths_csv = glob.glob(documents_pattern) 26 | 27 | logging.debug("Reading config file") 28 | parse_config_path = join(".", parse_config_directory, parse_config_file) 29 | if exists(parse_config_path): 30 | with open(parse_config_path) as key_file: 31 | column_content = key_file.read() 32 | column_configs = json.loads(column_content) 33 | else: 34 | logging.debug("Could not load parse config file") 35 | return 36 | 37 | logging.debug("Reading filter file") 38 | filter_config_path = join(".", filter_config_directory, filter_config_file) 39 | if exists(filter_config_path): 40 | with open(filter_config_path) as key_file: 41 | filter_content = key_file.read() 42 | filter_configs = json.loads(filter_content) 43 | else: 44 | logging.debug("Could not load parse config file") 45 | return 46 | 47 | for csv_document in documents_paths_csv: 48 | logging.debug(f"Processing: {csv_document}") 49 | columns_list = [] 50 | with open(csv_document, encoding="utf8") as f: 51 | first_line = f.readline() 52 | columns_line = "index" + first_line.strip() 53 | logging.debug("Matching csv type to config") 54 | for column_conf_key in column_configs: 55 | columns = column_configs[column_conf_key]["columns"] 56 | columns_string = ",".join(columns) 57 | if columns_string == columns_line: 58 | columns_list = columns 59 | logging.debug("Match found") 60 | break 61 | 62 | logging.debug("Reading to datafile") 63 | df = pd.read_csv(csv_document, header=0, names=columns_list) 64 | item_count = df.shape[0] 65 | logging.debug(f"item count: {item_count}") 66 | logging.debug(df.head()) 67 | 68 | for csv_filter in filter_configs["filters"]: 69 | if "whitelist" in csv_filter: 70 | whitelist = csv_filter["whitelist"] 71 | tags = csv_filter["filter_field"] 72 | df = df[df[tags].apply(lambda x, wordlist=set(whitelist): any(word in x for word in wordlist))] 73 | item_count = df.shape[0] 74 | logging.debug(f"item count: {item_count}") 75 | logging.debug(df.head()) 76 | 77 | if "blacklist" in csv_filter: 78 | blacklist = csv_filter["blacklist"] 79 | tags = csv_filter["filter_field"] 80 | df = df[df[tags].apply(lambda x, 
wordlist=set(blacklist): not any(word in x for word in wordlist))] 81 | item_count = df.shape[0] 82 | logging.debug(f"item count: {item_count}") 83 | logging.debug(df.head()) 84 | 85 | output = documents_directory + "/filtered.csv" 86 | df.to_csv(output, index=False) 87 | 88 | 89 | if __name__ == "__main__": 90 | # Read the data directory, collection name, and persist directory 91 | parser = argparse.ArgumentParser(description="Filter rows from a csv file using whitelist and blacklist filters") 92 | 93 | # Add arguments 94 | parser.add_argument( 95 | "--data-directory", 96 | type=str, 97 | default="./run_files/documents/csv_test", 98 | help="The directory where your csv files are stored", 99 | ) 100 | 101 | parser.add_argument( 102 | "--parse-config-directory", 103 | type=str, 104 | default="./run_files/parse_configs/", 105 | help="The parse config directory", 106 | ) 107 | 108 | parser.add_argument( 109 | "--parse-config-file", 110 | type=str, 111 | default="csv_columns.json", 112 | help="The parse config file", 113 | ) 114 | 115 | parser.add_argument( 116 | "--filter-config-directory", 117 | type=str, 118 | default="./run_files/filters/", 119 | help="The parse config directory", 120 | ) 121 | 122 | parser.add_argument( 123 | "--filter-config-file", 124 | type=str, 125 | default="csv_filter.json", 126 | help="The parse config file", 127 | ) 128 | 129 | # Parse arguments 130 | args = parser.parse_args() 131 | 132 | main( 133 | documents_directory=args.data_directory, 134 | parse_config_directory=args.parse_config_directory, 135 | parse_config_file=args.parse_config_file, 136 | filter_config_directory=args.filter_config_directory, 137 | filter_config_file=args.filter_config_file, 138 | ) 139 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_csv_to_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | from os.path import exists, join, splitext 7 | 8 | import pandas as pd 9 | from dotenv import find_dotenv, load_dotenv 10 | from trafilatura import extract 11 | 12 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 13 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 14 | load_dotenv(find_dotenv()) 15 | 16 | 17 | def main( 18 | documents_directory: str, 19 | parse_config_directory: str, 20 | parse_config_file: str, 21 | filter_config_directory: str, 22 | filter_config_file: str, 23 | ) -> None: 24 | documents_pattern = os.path.join(documents_directory, "*.csv") 25 | logging.debug(f"documents search pattern: {documents_pattern}") 26 | documents_paths_csv = glob.glob(documents_pattern) 27 | 28 | logging.debug("Reading config file") 29 | parse_config_path = join(".", parse_config_directory, parse_config_file) 30 | if exists(parse_config_path): 31 | with open(parse_config_path) as key_file: 32 | column_content = key_file.read() 33 | column_configs = json.loads(column_content) 34 | else: 35 | logging.debug("Could not load parse config file") 36 | return 37 | 38 | logging.debug("Reading filter file") 39 | filter_config_path = join(".", filter_config_directory, filter_config_file) 40 | if exists(filter_config_path): 41 | with open(filter_config_path) as key_file: 42 | filter_content = key_file.read() 43 | filter_configs = json.loads(filter_content) 44 | else: 45 | logging.debug("Could not load parse config file") 46 | return 47 | 48 
| parse_filters = filter_configs["filters"] 49 | 50 | for csv_document in documents_paths_csv: 51 | logging.debug(f"Processing: {csv_document}") 52 | data = "" 53 | columns_list = [] 54 | with open(csv_document, encoding="utf8") as f: 55 | first_line = f.readline() 56 | columns_line = "index" + first_line.strip() 57 | logging.debug("Matching csv type to config") 58 | for column_conf_key in column_configs: 59 | columns = column_configs[column_conf_key]["columns"] 60 | columns_string = ",".join(columns) 61 | index_columns_string = "index" + columns_string 62 | if columns_line in (columns_string, index_columns_string): 63 | columns_list = columns 64 | data = column_configs[column_conf_key]["datafield"] 65 | logging.debug("Match found") 66 | break 67 | 68 | logging.debug("Reading to datafile") 69 | df = pd.read_csv(csv_document, header=0, names=columns_list) 70 | # logging.debug(df.head()) 71 | # logging.debug(df[data].head()) 72 | 73 | for parse_filter in parse_filters: 74 | filter_iterator = iter(parse_filter) 75 | parse_regex = next(filter_iterator) 76 | parse_replacment = next(filter_iterator) 77 | logging.debug(f"Applying filter: {parse_regex}") 78 | df[data] = df[data].replace( 79 | to_replace=parse_filter[parse_regex], value=parse_filter[parse_replacment], regex=True 80 | ) 81 | base = splitext(csv_document)[0] 82 | doc_path = base + ".txt" 83 | logging.debug("Writing to file") 84 | with open(file=doc_path, mode="a", encoding="utf-8") as doc_file: 85 | for line in df[data].to_numpy(): 86 | clean_text = extract(line) 87 | if clean_text is not None: 88 | doc_file.write(clean_text + "\n\n") 89 | # logging.info(clean_text) 90 | 91 | 92 | if __name__ == "__main__": 93 | # Read the data directory, collection name, and persist directory 94 | parser = argparse.ArgumentParser(description="Parse csv file to a text file and filter out noise from web scrapes.") 95 | 96 | # Add arguments 97 | parser.add_argument( 98 | "--data-directory", 99 | type=str, 100 | default="./run_files/documents/csv_test", 101 | help="The directory where your csv files are stored", 102 | ) 103 | 104 | parser.add_argument( 105 | "--parse-config-directory", 106 | type=str, 107 | default="./run_files/parse_configs/", 108 | help="The parse config directory", 109 | ) 110 | 111 | parser.add_argument( 112 | "--parse-config-file", 113 | type=str, 114 | default="csv_columns.json", 115 | help="The parse config file", 116 | ) 117 | 118 | parser.add_argument( 119 | "--filter-config-directory", 120 | type=str, 121 | default="./run_files/filters/", 122 | help="The parse config directory", 123 | ) 124 | 125 | parser.add_argument( 126 | "--filter-config-file", 127 | type=str, 128 | default="web_scrape_filter.json", 129 | help="The parse config file", 130 | ) 131 | 132 | # Parse arguments 133 | args = parser.parse_args() 134 | 135 | main( 136 | documents_directory=args.data_directory, 137 | parse_config_directory=args.parse_config_directory, 138 | parse_config_file=args.parse_config_file, 139 | filter_config_directory=args.filter_config_directory, 140 | filter_config_file=args.filter_config_file, 141 | ) 142 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_json_documents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | import uuid 7 | from os import getenv 8 | from os.path import join 9 | 10 | import chromadb 11 | from chromadb.config 
import Settings 12 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 13 | from dotenv import find_dotenv, load_dotenv 14 | from langchain.docstore.document import Document 15 | from langchain.text_splitter import RecursiveCharacterTextSplitter 16 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 17 | from langchain_community.vectorstores import Chroma 18 | 19 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 20 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 21 | load_dotenv(find_dotenv()) 22 | 23 | 24 | def main( 25 | documents_directory: str, 26 | collection_name: str, 27 | persist_directory: str, 28 | key_storage: str, 29 | chunk_size: int, 30 | chunk_overlap: int, 31 | embeddings_type: str, 32 | ) -> None: 33 | model_dir = getenv("MODEL_DIR") 34 | model = getenv("MODEL") 35 | model_source = join(model_dir, model) 36 | embeddings_model = getenv("EMBEDDINGS_MODEL") 37 | 38 | all_documents = [] 39 | all_keys = {} 40 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 41 | 42 | documents_pattern = os.path.join(documents_directory, "*.json") 43 | documents_paths_json = glob.glob(documents_pattern) 44 | 45 | for json_document in documents_paths_json: 46 | with open(json_document, encoding="utf-8") as f: 47 | content = f.read() 48 | document_content = json.loads(content) 49 | if isinstance(document_content["entries"], list): 50 | logging.debug("Parsing List") 51 | for entry in document_content["entries"]: 52 | document_text = "" 53 | metadata_filters = {"source": json_document} 54 | 55 | if "content" in entry: 56 | document_text = document_text + entry["content"] 57 | elif "entry" in entry: 58 | document_text = document_text + entry["entry"] 59 | 60 | logging.debug(f"Extracted a key: {entry['keys']}") 61 | for m_filter in entry["keys"]: 62 | filter_uuid = str(uuid.uuid1()) 63 | metadata_filters[filter_uuid] = m_filter 64 | 65 | all_keys = metadata_filters 66 | json_doc = [Document(page_content=document_text, metadata=metadata_filters)] 67 | json_document_content = text_splitter.split_documents(json_doc) 68 | all_documents.extend(json_document_content) 69 | elif isinstance(document_content["entries"], dict): 70 | logging.debug("Parsing dict") 71 | for entry in document_content["entries"]: 72 | metadata_filters = {"source": json_document} 73 | document_text = document_text + document_content["entries"][entry]["content"] 74 | 75 | logging.debug(f"Extracted a key: {document_content['entries'][entry]['key']}") 76 | for m_filter in document_content["entries"][entry]["key"]: 77 | filter_uuid = str(uuid.uuid1()) 78 | metadata_filters[filter_uuid] = m_filter 79 | 80 | all_keys = metadata_filters 81 | json_doc = [Document(page_content=document_text, metadata=metadata_filters)] 82 | json_document_content = text_splitter.split_documents(json_doc) 83 | all_documents.extend(json_document_content) 84 | 85 | if embeddings_type == "llama": 86 | logging.info("Using llama embeddigs") 87 | params = { 88 | "n_ctx": getenv("N_CTX"), 89 | "n_batch": 1024, 90 | "n_gpu_layers": getenv("LAYERS"), 91 | } 92 | embedder = LlamaCppEmbeddings( 93 | model_path=model_source, 94 | **params, 95 | ) 96 | elif embeddings_type == "spacy": 97 | logging.info("Using spacy embeddigs") 98 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 99 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 100 | elif embeddings_type == 
"huggingface": 101 | logging.info("Using huggingface embeddigs") 102 | # model_name = "sentence-transformers/all-mpnet-base-v2" 103 | model_kwargs = {"device": "cpu"} 104 | encode_kwargs = {"normalize_embeddings": False} 105 | embedder = HuggingFaceEmbeddings( 106 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 107 | ) 108 | else: 109 | error_message = f"Unsupported embeddings type: {embeddings_type}" 110 | raise ValueError(error_message) 111 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 112 | Chroma.from_documents( 113 | client=client, 114 | documents=all_documents, 115 | embedding=embedder, 116 | persist_directory=persist_directory, 117 | collection_name=collection_name, 118 | collection_metadata={"hnsw:space": "l2"}, 119 | ) 120 | 121 | logging.debug(f"Key file content: {all_keys}") 122 | 123 | json_key_file = json.dumps(all_keys) 124 | # logging.debug(f"Key file uuid keys: {list(all_keys.keys())}") 125 | 126 | # If you enable this you might want to pipe the output to a file 127 | # logging.debug(all_documents) 128 | 129 | key_storage_path = os.path.join(key_storage, collection_name + ".json") 130 | with open(key_storage_path, "w", encoding="utf-8") as key_file: 131 | key_file.write(json_key_file) 132 | 133 | logging.info(f"Read files from directory: {documents_directory}") 134 | logging.info(f"Text parsed with chunk size: {chunk_size}, and chunk overlap: {chunk_overlap}") 135 | logging.debug(f"Saved collection as: {collection_name}") 136 | logging.debug(f"Saved collection to: {persist_directory}") 137 | logging.info(f"Wrote keys to: {key_storage_path}") 138 | 139 | 140 | if __name__ == "__main__": 141 | # Read the data directory, collection name, and persist directory 142 | parser = argparse.ArgumentParser(description="Parse json documents to documents and upload to chroma") 143 | 144 | # Add arguments 145 | parser.add_argument( 146 | "--data-directory", 147 | type=str, 148 | default="./run_files/documents/hogwarts", 149 | help="The directory where your text files are stored", 150 | ) 151 | parser.add_argument( 152 | "--collection-name", 153 | type=str, 154 | default="hogwarts", 155 | help="The name of the Chroma collection", 156 | ) 157 | parser.add_argument( 158 | "--persist-directory", 159 | type=str, 160 | default="./run_files/character_storage/", 161 | help="The directory where you want to store the Chroma collection", 162 | ) 163 | 164 | parser.add_argument( 165 | "--key-storage", 166 | type=str, 167 | default="./run_files/key_storage/", 168 | help="The directory where you want to store the Chroma collection metadata keys", 169 | ) 170 | 171 | parser.add_argument( 172 | "--chunk-size", 173 | type=int, 174 | default=1024, 175 | help="The text chunk size for parsing", 176 | ) 177 | 178 | parser.add_argument( 179 | "--chunk-overlap", 180 | type=int, 181 | default=0, 182 | help="The overlap for text chunks for parsing", 183 | ) 184 | 185 | parser.add_argument( 186 | "--embeddings-type", 187 | type=str, 188 | default="spacy", 189 | help="The chosen embeddings type", 190 | ) 191 | 192 | # Parse arguments 193 | args = parser.parse_args() 194 | 195 | main( 196 | documents_directory=args.data_directory, 197 | collection_name=args.collection_name, 198 | persist_directory=args.persist_directory, 199 | key_storage=args.key_storage, 200 | chunk_size=args.chunk_size, 201 | chunk_overlap=args.chunk_overlap, 202 | embeddings_type=args.embeddings_type, 203 | ) 204 | 
-------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_pdf_documents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import logging 4 | import os 5 | from os import getenv 6 | from os.path import join 7 | 8 | import chromadb 9 | from chromadb.config import Settings 10 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 11 | from dotenv import find_dotenv, load_dotenv 12 | from langchain.document_loaders import PyPDFLoader 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 15 | from langchain_community.vectorstores import Chroma 16 | 17 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 18 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 19 | load_dotenv(find_dotenv()) 20 | 21 | 22 | def main( 23 | documents_directory: str, 24 | collection_name: str, 25 | persist_directory: str, 26 | chunk_size: int, 27 | chunk_overlap: int, 28 | embeddings_type: str, 29 | ) -> None: 30 | model_dir = getenv("MODEL_DIR") 31 | model = getenv("MODEL") 32 | model_source = join(model_dir, model) 33 | embeddings_model = getenv("EMBEDDINGS_MODEL") 34 | 35 | documents_pattern = os.path.join(documents_directory, "*.pdf") 36 | logging.debug(f"documents search pattern: {documents_pattern}") 37 | documents_paths_pdf = glob.glob(documents_pattern) 38 | 39 | all_documents = [] 40 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 41 | for pdf_document in documents_paths_pdf: 42 | logging.debug(f"loading: {pdf_document}") 43 | loader = PyPDFLoader(pdf_document) 44 | docs = loader.load_and_split(text_splitter=text_splitter) 45 | all_documents.extend(docs) 46 | 47 | if embeddings_type == "llama": 48 | params = { 49 | "n_ctx": getenv("N_CTX"), 50 | "n_batch": 1024, 51 | "n_gpu_layers": getenv("LAYERS"), 52 | } 53 | 54 | logging.info("Using llama embeddigs") 55 | embedder = LlamaCppEmbeddings( 56 | model_path=model_source, 57 | **params, 58 | ) 59 | elif embeddings_type == "spacy": 60 | logging.info("Using spacy embeddigs") 61 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 62 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 63 | elif embeddings_type == "huggingface": 64 | logging.info("Using huggingface embeddigs") 65 | # model_name = "sentence-transformers/all-mpnet-base-v2" 66 | model_kwargs = {"device": "cpu"} 67 | encode_kwargs = {"normalize_embeddings": False} 68 | embedder = HuggingFaceEmbeddings( 69 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 70 | ) 71 | 72 | else: 73 | error_message = f"Unsupported embeddings type: {embeddings_type}" 74 | raise ValueError(error_message) 75 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 76 | Chroma.from_documents( 77 | client=client, 78 | documents=all_documents, 79 | embedding=embedder, 80 | persist_directory=persist_directory, 81 | collection_name=collection_name, 82 | collection_metadata={"hnsw:space": "l2"}, 83 | ) 84 | 85 | # If you enable this you might want to pipe the output to a file 86 | # logging.debug(all_documents) 87 | 88 | logging.info(f"Read files from directory: {documents_directory}") 89 | logging.info(f"Text parsed with chunk size: 
{chunk_size}, and chunk overlap: {chunk_overlap}") 90 | logging.debug(f"Saved collection as: {collection_name}") 91 | logging.debug(f"Saved collection to: {persist_directory}") 92 | 93 | 94 | if __name__ == "__main__": 95 | # Read the data directory, collection name, and persist directory 96 | parser = argparse.ArgumentParser(description="Parse pdf documents to documents and upload to chroma") 97 | 98 | # Add arguments 99 | parser.add_argument( 100 | "--data-directory", 101 | type=str, 102 | default="./run_files/documents/fyodor_dostoyevsky", 103 | help="The directory where your text files are stored", 104 | ) 105 | parser.add_argument( 106 | "--collection-name", 107 | type=str, 108 | default="dostoyevsky", 109 | help="The name of the Chroma collection", 110 | ) 111 | parser.add_argument( 112 | "--persist-directory", 113 | type=str, 114 | default="./run_files/character_storage/", 115 | help="The directory where you want to store the Chroma collection", 116 | ) 117 | 118 | parser.add_argument( 119 | "--chunk-size", 120 | type=int, 121 | default=1024, 122 | help="The text chunk size for parsing", 123 | ) 124 | 125 | parser.add_argument( 126 | "--chunk-overlap", 127 | type=int, 128 | default=0, 129 | help="The overlap for text chunks for parsing", 130 | ) 131 | 132 | parser.add_argument( 133 | "--embeddings-type", 134 | type=str, 135 | default="spacy", 136 | help="The chosen embeddings type", 137 | ) 138 | 139 | # Parse arguments 140 | args = parser.parse_args() 141 | 142 | main( 143 | documents_directory=args.data_directory, 144 | collection_name=args.collection_name, 145 | persist_directory=args.persist_directory, 146 | chunk_size=args.chunk_size, 147 | chunk_overlap=args.chunk_overlap, 148 | embeddings_type=args.embeddings_type, 149 | ) 150 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_text_documents.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | 7 | # For perf measuring 8 | import time 9 | from multiprocessing import Manager, Pool 10 | from os import getenv 11 | from os.path import join 12 | 13 | import chromadb 14 | import click 15 | import pandas as pd 16 | from chromadb.config import Settings 17 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 18 | from dotenv import find_dotenv, load_dotenv 19 | from langchain.text_splitter import RecursiveCharacterTextSplitter 20 | from langchain_community.document_loaders import TextLoader 21 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 22 | from langchain_community.vectorstores import Chroma 23 | from langchain_core.documents.base import Document 24 | 25 | # This is the config for multiprocess logger 26 | # Setting the level to debug outputs multiprocess debug lines too 27 | NER_LOGGER = mp.get_logger() 28 | FORMAT = "%(levelname)s:%(message)s" 29 | formatter = logging.Formatter(fmt=FORMAT) 30 | handler = logging.StreamHandler() 31 | handler.setFormatter(formatter) 32 | 33 | NER_LOGGER.addHandler(handler) 34 | NER_LOGGER.setLevel(logging.INFO) 35 | 36 | load_dotenv(find_dotenv()) 37 | 38 | 39 | def read_documents( 40 | all_documents, 41 | que, 42 | reader_num, 43 | ) -> bool: 44 | NER_LOGGER.info("Reading documents to que") 45 | for doc in all_documents: 46 | que.put(doc) 47 | for _i in range(reader_num): 48 | que.put("QUEUE_DONE") 49 | 
NER_LOGGER.info("Reader done") 50 | return True 51 | 52 | 53 | def process_documents(all_keys, read_que, write_que, name) -> bool: 54 | NER_LOGGER.info(f"Processor {name} reading documents from que") 55 | while True: 56 | try: 57 | document = read_que.get(timeout=10) 58 | except Exception as e: 59 | NER_LOGGER.info(f"Processor {name} timed out: {e}") 60 | write_que.put("QUEUE_DONE") 61 | return False 62 | 63 | if document == "QUEUE_DONE": 64 | NER_LOGGER.info(f"Processor {name} done") 65 | write_que.put("QUEUE_DONE") 66 | break 67 | 68 | for key in all_keys: 69 | if all_keys[key] in document.page_content: 70 | document.metadata[key] = all_keys[key] 71 | write_que.put(document) 72 | return True 73 | 74 | 75 | def clean_and_merge_documents(que, name) -> pd.DataFrame: 76 | NER_LOGGER.info(f"cleaner {name} reading documents from que") 77 | document_list = [] 78 | while True: 79 | try: 80 | document = que.get(timeout=10) 81 | except Exception as e: 82 | NER_LOGGER.info(f"Writer {name} timed out: {e}") 83 | return document_list 84 | if not isinstance(document, Document) and document == "QUEUE_DONE": 85 | NER_LOGGER.info(f"Writer {name} received done") 86 | break 87 | elif isinstance(document, Document): 88 | NER_LOGGER.info(f"Writer {name} received a document") 89 | document_list.append(document) 90 | 91 | return document_list 92 | 93 | 94 | @click.command() 95 | @click.option( 96 | "--documents-directory", 97 | "-d", 98 | "documents_directory", 99 | default="./run_files/documents/skynet", 100 | help="The directory where your text files are stored", 101 | ) 102 | @click.option("--collection-name", "-c", default="skynet", help="The name of the Chroma collection.") 103 | @click.option( 104 | "--persist-directory", 105 | "-p", 106 | default="./run_files/character_storage/", 107 | help="The directory where you want to store the Chroma collection.", 108 | ) 109 | @click.option( 110 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 111 | ) 112 | @click.option("--keyfile-name", "-k", default="none", help="Keyfile name. If not given, defaults to collection name.") 113 | @click.option("--embeddings-type", "-e", default="spacy", help="The chosen embeddings type.") 114 | @click.option("--threads", "-t", default=6, type=int, help="The number of threads to use for parsing.") 115 | @click.option("--chunk-size", "-cs", default=2048, type=int, help="Data chunk for size for parsing.") 116 | @click.option("--chunk-overlap", "-co", default=1024, type=int, help="Overlap for the chunks.") 117 | def main( 118 | documents_directory: str, 119 | collection_name: str, 120 | persist_directory: str, 121 | chunk_size: int, 122 | chunk_overlap: int, 123 | key_storage: str, 124 | keyfile_name: str, 125 | embeddings_type: str, 126 | threads: int, 127 | ) -> None: 128 | """ 129 | This script parses text documents into a chroma collection. Using langchain RecursiveSplitter. 130 | Text documents are loaded from a directory and parsed into chunk sized text pieces. 131 | These pieces are matched for metadata keys in keyfile. 132 | The matching is done with multiprocess to improve perf for large collections and keyfiles. 133 | The resulting documents are pushed into a Chroma vector data collection in persist-directory. 
134 | """ 135 | model_dir = getenv("MODEL_DIR") 136 | model = getenv("MODEL") 137 | model_source = join(model_dir, model) 138 | embeddings_model = getenv("EMBEDDINGS_MODEL") 139 | 140 | documents_pattern = os.path.join(documents_directory, "*.txt") 141 | documents_paths_txt = glob.glob(documents_pattern) 142 | 143 | all_documents = [] 144 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 145 | for txt_document in documents_paths_txt: 146 | loader = TextLoader(txt_document, encoding="utf-8") 147 | documents = loader.load() 148 | docs = text_splitter.split_documents(documents) 149 | all_documents.extend(docs) 150 | 151 | if keyfile_name == "none": 152 | key_storage_path = join(key_storage, collection_name + ".json") 153 | else: 154 | key_storage_path = join(key_storage, keyfile_name) 155 | 156 | all_keys = None 157 | NER_LOGGER.info(f"Loading filter list from: {key_storage_path}") 158 | with open(key_storage_path, encoding="utf-8") as key_file: 159 | content = key_file.read() 160 | all_keys = json.loads(content) 161 | if "Content" in all_keys: 162 | all_keys = all_keys["Content"] 163 | 164 | # Start Timer 165 | tic = time.perf_counter() 166 | 167 | manager = Manager() 168 | read_que = manager.Queue() 169 | write_que = manager.Queue() 170 | 171 | pool = Pool(threads) 172 | 173 | reader = pool.apply_async( 174 | read_documents, 175 | ( 176 | all_documents, 177 | read_que, 178 | threads, 179 | ), 180 | ) 181 | 182 | read_success = reader.get() 183 | if not read_success: 184 | return 185 | 186 | jobs = [] 187 | for i in range(threads): 188 | job = pool.apply_async( 189 | process_documents, 190 | ( 191 | all_keys, 192 | read_que, 193 | write_que, 194 | i, 195 | ), 196 | ) 197 | jobs.append(job) 198 | 199 | for job in jobs: 200 | job.get() 201 | 202 | jobs = [] 203 | for i in range(threads): 204 | job = pool.apply_async( 205 | clean_and_merge_documents, 206 | ( 207 | write_que, 208 | i, 209 | ), 210 | ) 211 | jobs.append(job) 212 | 213 | document_list = None 214 | for job in jobs: 215 | merge_result = job.get() 216 | if merge_result is not None: 217 | if document_list is None: 218 | document_list = merge_result 219 | else: 220 | document_list = document_list + merge_result 221 | 222 | pool.close() 223 | pool.join() 224 | 225 | # Stop timer 226 | toc = time.perf_counter() 227 | NER_LOGGER.info(f"Keys took {toc - tic:0.4f} seconds") 228 | 229 | tic = time.perf_counter() 230 | if embeddings_type == "llama": 231 | NER_LOGGER.info("Using llama embeddigs") 232 | params = { 233 | "n_ctx": getenv("N_CTX"), 234 | "n_batch": 1024, 235 | "n_gpu_layers": getenv("LAYERS"), 236 | } 237 | embedder = LlamaCppEmbeddings( 238 | model_path=model_source, 239 | **params, 240 | ) 241 | elif embeddings_type == "spacy": 242 | NER_LOGGER.info("Using spacy embeddigs") 243 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 244 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 245 | elif embeddings_type == "huggingface": 246 | NER_LOGGER.info("Using huggingface embeddigs") 247 | # model_name = "sentence-transformers/all-mpnet-base-v2" 248 | model_kwargs = {"device": "cpu"} 249 | encode_kwargs = {"normalize_embeddings": False} 250 | embedder = HuggingFaceEmbeddings( 251 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 252 | ) 253 | 254 | else: 255 | error_message = f"Unsupported embeddings type: {embeddings_type}" 256 | raise ValueError(error_message) 257 | client = 
chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 258 | Chroma.from_documents( 259 | client=client, 260 | documents=document_list, 261 | embedding=embedder, 262 | persist_directory=persist_directory, 263 | collection_name=collection_name, 264 | collection_metadata={"hnsw:space": "l2"}, 265 | ) 266 | 267 | # Stop timer 268 | toc = time.perf_counter() 269 | NER_LOGGER.info(f"Storing embeddings took {toc - tic:0.4f} seconds") 270 | 271 | NER_LOGGER.info(f"Read metadata filters from directory: {key_storage_path}") 272 | NER_LOGGER.info(f"Read files from directory: {documents_directory}") 273 | NER_LOGGER.info(f"Text parsed with chunk size: {chunk_size}, and chunk overlap: {chunk_overlap}") 274 | NER_LOGGER.info(f"Saved collection as: {collection_name}") 275 | NER_LOGGER.info(f"Saved collection to: {persist_directory}") 276 | 277 | 278 | if __name__ == "__main__": 279 | main() 280 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_text_documents_old.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | from os import getenv 7 | from os.path import join 8 | 9 | import chromadb 10 | from chromadb.config import Settings 11 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 12 | from dotenv import find_dotenv, load_dotenv 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain_community.document_loaders import TextLoader 15 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 16 | from langchain_community.vectorstores import Chroma 17 | 18 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 19 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 20 | load_dotenv(find_dotenv()) 21 | 22 | 23 | def main( 24 | documents_directory: str, 25 | collection_name: str, 26 | persist_directory: str, 27 | chunk_size: int, 28 | chunk_overlap: int, 29 | key_storage: str, 30 | embeddings_type: str, 31 | ) -> None: 32 | model_dir = getenv("MODEL_DIR") 33 | model = getenv("MODEL") 34 | model_source = join(model_dir, model) 35 | embeddings_model = getenv("EMBEDDINGS_MODEL") 36 | 37 | documents_pattern = os.path.join(documents_directory, "*.txt") 38 | documents_paths_txt = glob.glob(documents_pattern) 39 | 40 | all_documents = [] 41 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 42 | for txt_document in documents_paths_txt: 43 | loader = TextLoader(txt_document, encoding="utf-8") 44 | documents = loader.load() 45 | docs = text_splitter.split_documents(documents) 46 | all_documents.extend(docs) 47 | 48 | key_storage_path = join(key_storage, collection_name + ".json") 49 | 50 | with open(key_storage_path, encoding="utf-8") as key_file: 51 | content = key_file.read() 52 | all_keys = json.loads(content) 53 | if "Content" in all_keys: 54 | all_keys = all_keys["Content"] 55 | 56 | logging.debug(f"Loading filter list from: {key_storage_path}") 57 | # logging.debug(f"Filter keys: {all_keys}") 58 | 59 | # If a metadata filter is found in the chunk, then add as metadata for that chunk 60 | for chunk in all_documents: 61 | logging.debug("-----------------------------------") 62 | for key in all_keys: 63 | if all_keys[key].lower() in chunk.page_content.lower(): 64 | 
chunk.metadata[key] = all_keys[key] 65 | logging.debug(chunk) 66 | 67 | if embeddings_type == "llama": 68 | logging.info("Using llama embeddigs") 69 | params = { 70 | "n_ctx": getenv("N_CTX"), 71 | "n_batch": 1024, 72 | "n_gpu_layers": getenv("LAYERS"), 73 | } 74 | embedder = LlamaCppEmbeddings( 75 | model_path=model_source, 76 | **params, 77 | ) 78 | elif embeddings_type == "spacy": 79 | logging.info("Using spacy embeddigs") 80 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 81 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 82 | elif embeddings_type == "huggingface": 83 | logging.info("Using huggingface embeddigs") 84 | # model_name = "sentence-transformers/all-mpnet-base-v2" 85 | model_kwargs = {"device": "cpu"} 86 | encode_kwargs = {"normalize_embeddings": False} 87 | embedder = HuggingFaceEmbeddings( 88 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 89 | ) 90 | 91 | else: 92 | error_message = f"Unsupported embeddings type: {embeddings_type}" 93 | raise ValueError(error_message) 94 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 95 | Chroma.from_documents( 96 | client=client, 97 | documents=all_documents, 98 | embedding=embedder, 99 | persist_directory=persist_directory, 100 | collection_name=collection_name, 101 | collection_metadata={"hnsw:space": "l2"}, 102 | ) 103 | 104 | logging.info(f"Read metadata filters from directory: {key_storage_path}") 105 | logging.info(f"Read files from directory: {documents_directory}") 106 | logging.info(f"Text parsed with chunk size: {chunk_size}, and chunk overlap: {chunk_overlap}") 107 | logging.debug(f"Saved collection as: {collection_name}") 108 | logging.debug(f"Saved collection to: {persist_directory}") 109 | 110 | 111 | if __name__ == "__main__": 112 | # Read the data directory, collection name, and persist directory 113 | parser = argparse.ArgumentParser(description="Parse text into documents and upload to chroma") 114 | 115 | # Add arguments 116 | parser.add_argument( 117 | "--data-directory", 118 | type=str, 119 | default="./run_files/documents/skynet", 120 | help="The directory where your text files are stored", 121 | ) 122 | parser.add_argument( 123 | "--collection-name", 124 | type=str, 125 | default="skynet", 126 | help="The name of the Chroma collection", 127 | ) 128 | parser.add_argument( 129 | "--persist-directory", 130 | type=str, 131 | default="./run_files/character_storage/", 132 | help="The directory where you want to store the Chroma collection", 133 | ) 134 | 135 | parser.add_argument( 136 | "--key-storage", 137 | type=str, 138 | default="./run_files/key_storage/", 139 | help="The directory for the collection metadata keys", 140 | ) 141 | 142 | parser.add_argument( 143 | "--chunk-size", 144 | type=int, 145 | default=2048, 146 | help="The text chunk size for parsing", 147 | ) 148 | 149 | parser.add_argument( 150 | "--chunk-overlap", 151 | type=int, 152 | default=1024, 153 | help="The overlap for text chunks for parsing", 154 | ) 155 | parser.add_argument( 156 | "--embeddings-type", 157 | type=str, 158 | default="spacy", 159 | help="The chosen embeddings type", 160 | ) 161 | 162 | # Parse arguments 163 | args = parser.parse_args() 164 | 165 | main( 166 | documents_directory=args.data_directory, 167 | collection_name=args.collection_name, 168 | persist_directory=args.persist_directory, 169 | key_storage=args.key_storage, 170 | chunk_size=args.chunk_size, 171 | chunk_overlap=args.chunk_overlap, 172 | 
embeddings_type=args.embeddings_type, 173 | ) 174 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_text_documents_simple.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | 7 | # For perf measuring 8 | import time 9 | from multiprocessing import Manager, Pool 10 | from os import getenv 11 | from os.path import join 12 | 13 | import chromadb 14 | import click 15 | import pandas as pd 16 | from chromadb.config import Settings 17 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 18 | from dotenv import find_dotenv, load_dotenv 19 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 20 | from langchain_community.vectorstores import Chroma 21 | from langchain_core.documents.base import Document 22 | 23 | # This is the config for multiprocess logger 24 | # Setting the level to debug outputs multiprocess debug lines too 25 | NER_LOGGER = mp.get_logger() 26 | FORMAT = "%(levelname)s:%(message)s" 27 | formatter = logging.Formatter(fmt=FORMAT) 28 | handler = logging.StreamHandler() 29 | handler.setFormatter(formatter) 30 | 31 | NER_LOGGER.addHandler(handler) 32 | NER_LOGGER.setLevel(logging.INFO) 33 | 34 | load_dotenv(find_dotenv()) 35 | 36 | 37 | def read_documents( 38 | all_documents, 39 | que, 40 | reader_num, 41 | ) -> bool: 42 | NER_LOGGER.info("Reading documents to que") 43 | for doc in all_documents: 44 | que.put(doc) 45 | for _i in range(reader_num): 46 | que.put("QUEUE_DONE") 47 | NER_LOGGER.info("Reader done") 48 | return True 49 | 50 | 51 | def process_documents(all_keys, read_que, write_que, name) -> bool: 52 | NER_LOGGER.info(f"Processor {name} reading documents from que") 53 | while True: 54 | try: 55 | document = read_que.get(timeout=10) 56 | except Exception as e: 57 | NER_LOGGER.info(f"Processor {name} timed out: {e}") 58 | write_que.put("QUEUE_DONE") 59 | return False 60 | 61 | if document == "QUEUE_DONE": 62 | NER_LOGGER.info(f"Processor {name} done") 63 | write_que.put("QUEUE_DONE") 64 | break 65 | 66 | for key in all_keys: 67 | if all_keys[key] in document.page_content: 68 | document.metadata[key] = all_keys[key] 69 | write_que.put(document) 70 | return True 71 | 72 | 73 | def clean_and_merge_documents(que, name) -> pd.DataFrame: 74 | NER_LOGGER.info(f"cleaner {name} reading documents from que") 75 | document_list = [] 76 | while True: 77 | try: 78 | document = que.get(timeout=10) 79 | except Exception as e: 80 | NER_LOGGER.info(f"Writer {name} timed out: {e}") 81 | return document_list 82 | if not isinstance(document, Document) and document == "QUEUE_DONE": 83 | NER_LOGGER.info(f"Writer {name} received done") 84 | break 85 | elif isinstance(document, Document): 86 | NER_LOGGER.info(f"Writer {name} received a document") 87 | document_list.append(document) 88 | 89 | return document_list 90 | 91 | 92 | @click.command() 93 | @click.option( 94 | "--documents-directory", 95 | "-d", 96 | "documents_directory", 97 | default="./run_files/documents/skynet", 98 | help="The directory where your text files are stored", 99 | ) 100 | @click.option("--collection-name", "-c", default="skynet", help="The name of the Chroma collection.") 101 | @click.option( 102 | "--persist-directory", 103 | "-p", 104 | default="./run_files/character_storage/", 105 | help="The directory where you want to store the Chroma collection.", 
106 | ) 107 | @click.option( 108 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 109 | ) 110 | @click.option("--keyfile-name", "-k", default="none", help="Keyfile name. If not given, defaults to collection name.") 111 | @click.option("--embeddings-type", "-e", default="spacy", help="The chosen embeddings type.") 112 | @click.option("--threads", "-t", default=6, type=int, help="The number of threads to use for parsing.") 113 | def main( 114 | documents_directory: str, 115 | collection_name: str, 116 | persist_directory: str, 117 | key_storage: str, 118 | keyfile_name: str, 119 | embeddings_type: str, 120 | threads: int, 121 | ) -> None: 122 | """ 123 | This script parses text documents into a chroma collection. Using simple stop string parsing. 124 | Text documents are loaded from a directory and parsed into chunk sized text pieces. 125 | These pieces are matched for metadata keys in keyfile. 126 | The matching is done with multiprocess to improve perf for large collections and keyfiles. 127 | The resulting documents are pushed into a Chroma vector data collection in persist-directory. 128 | """ 129 | model_dir = getenv("MODEL_DIR") 130 | model = getenv("MODEL") 131 | model_source = join(model_dir, model) 132 | embeddings_model = getenv("EMBEDDINGS_MODEL") 133 | 134 | documents_pattern = os.path.join(documents_directory, "*.txt") 135 | documents_paths_txt = glob.glob(documents_pattern) 136 | 137 | all_documents = [] 138 | for txt_document in documents_paths_txt: 139 | docs = [] 140 | with open(txt_document, encoding="utf-8") as f: 141 | text = f.read() 142 | split_text = text.split("\n\n") 143 | 144 | for line in split_text: 145 | text_doc = Document(line) 146 | docs.append(text_doc) 147 | 148 | all_documents.extend(docs) 149 | 150 | if keyfile_name == "none": 151 | key_storage_path = join(key_storage, collection_name + ".json") 152 | else: 153 | key_storage_path = join(key_storage, keyfile_name) 154 | 155 | all_keys = None 156 | NER_LOGGER.info(f"Loading filter list from: {key_storage_path}") 157 | with open(key_storage_path, encoding="utf-8") as key_file: 158 | content = key_file.read() 159 | all_keys = json.loads(content) 160 | if "Content" in all_keys: 161 | all_keys = all_keys["Content"] 162 | 163 | tic = time.perf_counter() 164 | 165 | manager = Manager() 166 | read_que = manager.Queue() 167 | write_que = manager.Queue() 168 | 169 | pool = Pool(threads) 170 | 171 | reader = pool.apply_async( 172 | read_documents, 173 | ( 174 | all_documents, 175 | read_que, 176 | threads, 177 | ), 178 | ) 179 | 180 | read_success = reader.get() 181 | if not read_success: 182 | return 183 | 184 | jobs = [] 185 | for i in range(threads): 186 | job = pool.apply_async( 187 | process_documents, 188 | ( 189 | all_keys, 190 | read_que, 191 | write_que, 192 | i, 193 | ), 194 | ) 195 | jobs.append(job) 196 | 197 | for job in jobs: 198 | job.get() 199 | 200 | jobs = [] 201 | for i in range(threads): 202 | job = pool.apply_async( 203 | clean_and_merge_documents, 204 | ( 205 | write_que, 206 | i, 207 | ), 208 | ) 209 | jobs.append(job) 210 | 211 | document_list = None 212 | for job in jobs: 213 | merge_result = job.get() 214 | if merge_result is not None: 215 | if document_list is None: 216 | document_list = merge_result 217 | else: 218 | document_list = document_list + merge_result 219 | 220 | pool.close() 221 | pool.join() 222 | 223 | # Stop timer 224 | toc = time.perf_counter() 225 | NER_LOGGER.info(f"Keys took {toc - tic:0.4f} seconds") 226 | 
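# The embedder selection below mirrors the other parsers: "llama" loads the GGUF model at MODEL_DIR/MODEL
# through LlamaCppEmbeddings, while "spacy" and "huggingface" both take their model name from
# EMBEDDINGS_MODEL in .env; any other --embeddings-type value raises ValueError before anything is stored.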
227 | tic = time.perf_counter() 228 | if embeddings_type == "llama": 229 | NER_LOGGER.info("Using llama embeddigs") 230 | params = { 231 | "n_ctx": getenv("N_CTX"), 232 | "n_batch": 1024, 233 | "n_gpu_layers": getenv("LAYERS"), 234 | } 235 | embedder = LlamaCppEmbeddings( 236 | model_path=model_source, 237 | **params, 238 | ) 239 | elif embeddings_type == "spacy": 240 | NER_LOGGER.info("Using spacy embeddigs") 241 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 242 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 243 | elif embeddings_type == "huggingface": 244 | NER_LOGGER.info("Using huggingface embeddigs") 245 | # model_name = "sentence-transformers/all-mpnet-base-v2" 246 | model_kwargs = {"device": "cpu"} 247 | encode_kwargs = {"normalize_embeddings": False} 248 | embedder = HuggingFaceEmbeddings( 249 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 250 | ) 251 | 252 | else: 253 | error_message = f"Unsupported embeddings type: {embeddings_type}" 254 | raise ValueError(error_message) 255 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 256 | Chroma.from_documents( 257 | client=client, 258 | documents=document_list, 259 | embedding=embedder, 260 | persist_directory=persist_directory, 261 | collection_name=collection_name, 262 | collection_metadata={"hnsw:space": "l2"}, 263 | ) 264 | 265 | # Stop timer 266 | toc = time.perf_counter() 267 | NER_LOGGER.info(f"Storing embeddings took {toc - tic:0.4f} seconds") 268 | NER_LOGGER.info(f"Read metadata filters from directory: {key_storage_path}") 269 | if keyfile_name == "none": 270 | NER_LOGGER.info(f"Metadata file is: {collection_name}.json") 271 | else: 272 | NER_LOGGER.info(f"Metadata file is: {keyfile_name}") 273 | NER_LOGGER.info(f"Read files from directory: {documents_directory}") 274 | NER_LOGGER.info(f"Saved collection as: {collection_name}") 275 | NER_LOGGER.info(f"Saved collection to: {persist_directory}") 276 | 277 | 278 | if __name__ == "__main__": 279 | main() 280 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/spacier/__init__.py: -------------------------------------------------------------------------------- 1 | from document_parsing.spacier import core, utils 2 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/spacier/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.spacier.core`: Convenient entry point for loading spaCy language pipelines 3 | and making spaCy docs. 
4 | """ 5 | 6 | import functools 7 | import logging 8 | import pathlib 9 | 10 | import spacy 11 | from cachetools import cached 12 | from cachetools.keys import hashkey 13 | from spacy.language import Language 14 | from spacy.tokens import Doc 15 | 16 | from document_parsing.spacier import utils as sputils 17 | from document_parsing.utils import cache, errors, types, utils 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | SNIPPET_SIZE = 50 21 | 22 | 23 | @cached(cache.LRU_CACHE, key=functools.partial(hashkey, "spacy_lang")) 24 | def load_spacy_lang(name: str | pathlib.Path, **kwargs) -> Language: 25 | """ 26 | Load a spaCy ``Language`` — a shared vocabulary and language-specific data 27 | for tokenizing text, and (if available) model data and a processing pipeline 28 | containing a sequence of components for annotating a document — and cache results, 29 | for quick reloading as needed. 30 | 31 | Note that as of spaCy v3, for which pipeline aliases are no longer allowed, 32 | this function is just a convenient access point to underlying :func:`spacy.load()`. 33 | 34 | .. code-block:: pycon 35 | 36 | >>> en_nlp = textacy.load_spacy_lang("en_core_web_sm") 37 | >>> en_nlp = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",)) 38 | >>> textacy.load_spacy_lang("ar") 39 | ... 40 | OSError: [E050] Can't find model 'ar'. 41 | It doesn't seem to be a Python package or a valid path to a data directory. 42 | 43 | Args: 44 | name: Name or path to the spaCy language pipeline to load. 45 | **kwargs 46 | 47 | Note: 48 | Although spaCy's API specifies some kwargs as ``List[str]``, here we require 49 | ``Tuple[str, ...]`` equivalents. Language pipelines are stored in an LRU cache 50 | with unique identifiers generated from the hash of the function name and args — 51 | and lists aren't hashable. 52 | 53 | Returns: 54 | Loaded spaCy ``Language``. 55 | 56 | Raises: 57 | OSError 58 | 59 | See Also: 60 | https://spacy.io/api/top-level#spacy.load 61 | """ 62 | spacy_lang = spacy.load(name, **kwargs) 63 | LOGGER.info("loaded '%s' spaCy language pipeline", name) 64 | return spacy_lang 65 | 66 | 67 | def make_spacy_doc( 68 | data: types.DocData, 69 | lang: types.LangLikeInContext, 70 | *, 71 | chunk_size: int | None = None, 72 | ) -> Doc: 73 | """ 74 | Make a :class:`spacy.tokens.Doc` from valid inputs, and automatically 75 | load/validate :class:`spacy.language.Language` pipelines to process ``data``. 76 | 77 | Make a ``Doc`` from text: 78 | 79 | .. code-block:: pycon 80 | 81 | >>> text = "To be, or not to be, that is the question." 82 | >>> doc = make_spacy_doc(text, "en_core_web_sm") 83 | >>> doc._.preview 84 | 'Doc(13 tokens: "To be, or not to be, that is the question.")' 85 | 86 | Make a ``Doc`` from a (text, metadata) pair, aka a "record": 87 | 88 | .. code-block:: pycon 89 | 90 | >>> record = (text, {"author": "Shakespeare, William"}) 91 | >>> doc = make_spacy_doc(record, "en_core_web_sm") 92 | >>> doc._.preview 93 | 'Doc(13 tokens: "To be, or not to be, that is the question.")' 94 | >>> doc._.meta 95 | {'author': 'Shakespeare, William'} 96 | 97 | Specify the language pipeline used to process the text in a few different ways: 98 | 99 | .. code-block:: pycon 100 | 101 | >>> make_spacy_doc(text, lang="en_core_web_sm") 102 | >>> make_spacy_doc(text, lang=textacy.load_spacy_lang("en_core_web_sm")) 103 | >>> make_spacy_doc(text, lang=lambda txt: "en_core_web_sm") 104 | 105 | Ensure that an already-processed ``Doc`` is compatible with ``lang``: 106 | 107 | .. 
code-block:: pycon 108 | 109 | >>> spacy_lang = textacy.load_spacy_lang("en_core_web_sm") 110 | >>> doc = spacy_lang(text) 111 | >>> make_spacy_doc(doc, lang="en_core_web_sm") 112 | >>> make_spacy_doc(doc, lang="es_core_news_sm") 113 | ... 114 | ValueError: `spacy.Vocab` used to process document must be the same 115 | as that used by the `lang` pipeline ('es_core_news_sm') 116 | 117 | Args: 118 | data: Make a :class:`spacy.tokens.Doc` from a text or (text, metadata) pair. 119 | If already a ``Doc``, ensure that it's compatible with ``lang`` 120 | to avoid surprises downstream, and return it as-is. 121 | lang: Language with which spaCy processes (or processed) ``data``, 122 | represented as the full name of a spaCy language pipeline, the path on disk 123 | to it, an already instantiated pipeline, or a callable function that takes 124 | the text component of ``data`` and outputs one of the above representations. 125 | chunk_size: Size of chunks in number of characters into which ``text`` will be 126 | split before processing each via spaCy and concatenating the results 127 | into a single ``Doc``. 128 | 129 | .. note:: This is intended as a workaround for processing very long texts, 130 | for which spaCy is unable to allocate enough RAM. For best performance, 131 | chunk size should be somewhere between 1e3 and 1e7 characters, 132 | depending on how much RAM you have available. 133 | 134 | Since chunking is done by *character*, chunks' boundaries likely 135 | won't respect natural language segmentation, and as a result 136 | spaCy's models may make mistakes on sentences/words that cross them. 137 | 138 | Returns: 139 | Processed spaCy Doc. 140 | 141 | Raises: 142 | TypeError 143 | ValueError 144 | """ 145 | if isinstance(data, str): 146 | return _make_spacy_doc_from_text(data, lang, chunk_size) 147 | elif isinstance(data, Doc): 148 | return _make_spacy_doc_from_doc(data, lang) 149 | elif utils.is_record(data): 150 | return _make_spacy_doc_from_record(data, lang, chunk_size) 151 | else: 152 | raise TypeError(errors.type_invalid_msg("data", type(data), types.DocData)) 153 | 154 | 155 | def _make_spacy_doc_from_text(text: str, lang: types.LangLikeInContext, chunk_size: int | None) -> Doc: 156 | spacy_lang = sputils.resolve_langlikeincontext(text, lang) 157 | if chunk_size: 158 | doc = _make_spacy_doc_from_text_chunks(text, spacy_lang, chunk_size) 159 | else: 160 | doc = spacy_lang(text) 161 | return doc 162 | 163 | 164 | def _make_spacy_doc_from_record(record: types.Record, lang: types.LangLikeInContext, chunk_size: int | None) -> Doc: 165 | text, meta = record 166 | spacy_lang = sputils.resolve_langlikeincontext(text, lang) 167 | if chunk_size: 168 | doc = _make_spacy_doc_from_text_chunks(text, spacy_lang, chunk_size) 169 | else: 170 | doc = spacy_lang(text) 171 | doc._.meta = meta 172 | return doc 173 | 174 | 175 | def _make_spacy_doc_from_text_chunks(text: str, lang: Language, chunk_size: int) -> Doc: 176 | text_chunks = (text[i : i + chunk_size] for i in range(0, len(text), chunk_size)) 177 | return Doc.from_docs(list(lang.pipe(text_chunks))) 178 | 179 | 180 | def _make_spacy_doc_from_doc(doc: Doc, lang: types.LangLikeInContext) -> Doc: 181 | spacy_lang = sputils.resolve_langlikeincontext(doc.text, lang) 182 | # we want to make sure that the language used to create `doc` is the same as 183 | # the one passed here; however, the best we can do (bc of spaCy's API) is ensure 184 | # that they share the same vocab 185 | if doc.vocab is not spacy_lang.vocab: 186 | msg = ( 187 | f"`spacy.Vocab` 
used to process document ({doc.vocab}) must be the same " 188 | f"as that used by the `lang` pipeline ({spacy_lang.vocab})" 189 | ) 190 | raise ValueError(msg) 191 | return doc 192 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/spacier/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | spaCy Utils 3 | ----------- 4 | 5 | :mod:`textacy.spacier.utils`: Helper functions for working with / extending spaCy's 6 | core functionality. 7 | """ 8 | 9 | import pathlib 10 | 11 | from spacy.language import Language 12 | 13 | from document_parsing.spacier import core 14 | from document_parsing.utils import errors, types 15 | 16 | 17 | def resolve_langlikeincontext(text: str, lang: types.LangLikeInContext) -> Language: 18 | if isinstance(lang, Language): 19 | return lang 20 | elif isinstance(lang, str | pathlib.Path): 21 | return core.load_spacy_lang(lang) 22 | elif callable(lang): 23 | return resolve_langlikeincontext(text, lang(text)) 24 | else: 25 | raise TypeError(errors.type_invalid_msg("lang", type(lang), types.LangLikeInContext)) 26 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/test_query.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from functools import partial 4 | from os.path import exists, join 5 | 6 | import click 7 | import spacy 8 | from dotenv import find_dotenv, load_dotenv 9 | 10 | from document_parsing.extract import entities, ngrams, terms 11 | from document_parsing.extract.basics import terms_to_strings 12 | 13 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 14 | load_dotenv(find_dotenv()) 15 | 16 | 17 | @click.command() 18 | @click.argument("query") 19 | @click.option( 20 | "--model", 21 | "-m", 22 | default="en_core_web_lg", 23 | help="The spacy model to parse the text", 24 | ) 25 | @click.option( 26 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 27 | ) 28 | @click.option( 29 | "--parse-config-file", 30 | "-pcf", 31 | default="query_metadata_filter.json", 32 | help="The parse config file", 33 | ) 34 | def main( 35 | query: str, 36 | model: str, 37 | parse_config_directory: str, 38 | parse_config_file: str, 39 | ) -> None: 40 | """ 41 | This script is for testing metadata parsing with spacy. Parses the keywords from a query. 
42 | """ 43 | spacy_lang = spacy.load(model) 44 | doc = spacy_lang(query) 45 | parse_config_path = join(".", parse_config_directory, parse_config_file) 46 | if exists(parse_config_path): 47 | with open(parse_config_path) as key_file: 48 | filter_content = key_file.read() 49 | filter_configs = json.loads(filter_content) 50 | else: 51 | logging.info("Could not load parse config file") 52 | return 53 | 54 | ngrams_list = filter_configs["ngs"] 55 | entities_list = filter_configs["entities"] 56 | noun_chunks = filter_configs["noun_chunks"] 57 | extract_type = filter_configs["extract_type"] 58 | 59 | logging.info("Extracting terms from corpus") 60 | extracted_terms = terms( 61 | doc, 62 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 63 | ents=partial( 64 | entities, 65 | include_types=entities_list, 66 | ), 67 | dedupe=True, 68 | ) 69 | 70 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 71 | logging.info(lemma_strings) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/document_parsing/utils/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.cache`: Functionality for caching language data and other NLP resources. 3 | Loading data from disk can be slow; let's just do it once and forget about it. :) 4 | """ 5 | import inspect 6 | import logging 7 | import os 8 | import sys 9 | 10 | from cachetools import LRUCache 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | def _get_size(obj, seen=None): 16 | """ 17 | Recursively find the actual size of an object, in bytes. 18 | 19 | Taken as-is (with tweaked function name and log level) from https://github.com/bosswissam/pysize. 20 | """ 21 | size = sys.getsizeof(obj) 22 | if seen is None: 23 | seen = set() 24 | obj_id = id(obj) 25 | if obj_id in seen: 26 | return 0 27 | # Important mark as seen *before* entering recursion to gracefully handle 28 | # self-referential objects 29 | seen.add(obj_id) 30 | if hasattr(obj, "__dict__"): 31 | for cls in obj.__class__.__mro__: 32 | if "__dict__" in cls.__dict__: 33 | d = cls.__dict__["__dict__"] 34 | if inspect.isgetsetdescriptor(d) or inspect.ismemberdescriptor(d): 35 | size += _get_size(obj.__dict__, seen) 36 | break 37 | if isinstance(obj, dict): 38 | size += sum(_get_size(v, seen) for v in obj.values()) 39 | size += sum(_get_size(k, seen) for k in obj.keys()) 40 | elif hasattr(obj, "__iter__") and not isinstance(obj, str | bytes | bytearray): 41 | try: 42 | size += sum(_get_size(i, seen) for i in obj) 43 | except TypeError: 44 | LOGGER.warning( 45 | "Unable to get size of %r. This may lead to incorrect sizes. 
Please report this error.", 46 | obj, 47 | ) 48 | if hasattr(obj, "__slots__"): # can have __slots__ with __dict__ 49 | size += sum(_get_size(getattr(obj, s), seen) for s in obj.__slots__ if hasattr(obj, s)) 50 | 51 | return size 52 | 53 | 54 | LRU_CACHE: LRUCache = LRUCache(int(os.environ.get("TEXTACY_MAX_CACHE_SIZE", 2147483648)), getsizeof=_get_size) 55 | """ 56 | Least Recently Used (LRU) cache for loaded data. 57 | 58 | The max cache size may be set by the `TEXTACY_MAX_CACHE_SIZE` environment variable, 59 | where the value must be an integer (in bytes). Otherwise, the max size is 2GB. 60 | """ 61 | 62 | 63 | def clear(): 64 | """Clear textacy's cache of loaded data.""" 65 | global LRU_CACHE 66 | LRU_CACHE.clear() 67 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of regular expressions and other (small, generally useful) constants. 3 | """ 4 | 5 | import re 6 | from re import Pattern 7 | 8 | NUMERIC_ENT_TYPES: set[str] = { 9 | "ORDINAL", 10 | "CARDINAL", 11 | "MONEY", 12 | "QUANTITY", 13 | "PERCENT", 14 | "TIME", 15 | "DATE", 16 | } 17 | 18 | 19 | RE_ALNUM: Pattern = re.compile(r"[^\W_]+") 20 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.errors`: Helper functions for making consistent errors. 3 | """ 4 | from collections.abc import Collection 5 | from typing import Any 6 | 7 | 8 | def value_invalid_msg(name: str, value: Any, valid_values: Collection[Any]) -> str: 9 | return f"`{name}` value = {value} is invalid; value must be one of {valid_values}." 10 | 11 | 12 | def type_invalid_msg(name: str, val_type, valid_val_type) -> str: 13 | return f"`{name}` type = {val_type} is invalid; type must match {valid_val_type}." 14 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.types`: Definitions for common object types used throughout the package. 3 | """ 4 | 5 | from collections.abc import Callable, Iterable 6 | from pathlib import Path 7 | from typing import NamedTuple 8 | 9 | from spacy.language import Language 10 | from spacy.tokens import Doc, Span, Token 11 | 12 | PathLike = str | Path 13 | 14 | DocLike = Doc | Span 15 | SpanLike = Span | Token 16 | DocLikeToSpans = Callable[[DocLike], Iterable[Span]] 17 | 18 | LangLikeInContext = PathLike | Language | Callable[[str], str] | Callable[[str], Path] | Callable[[str], Language] 19 | 20 | 21 | # typed equivalent to Record = collections.namedtuple("Record", ["text", "meta"]) 22 | class Record(NamedTuple): 23 | text: str 24 | meta: dict 25 | 26 | 27 | DocData = str | Record | Doc 28 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.utils`: Variety of general-purpose utility functions for inspecting / 3 | validating / transforming args and facilitating meta package tasks. 
4 | """ 5 | 6 | from collections.abc import Iterable 7 | from typing import ( 8 | Any, 9 | ) 10 | 11 | # a (text, metadata) 2-tuple 12 | RECORD_LEN = 2 13 | 14 | 15 | def is_record(obj: Any) -> bool: 16 | """Check whether ``obj`` is a "record" -- that is, a (text, metadata) 2-tuple.""" 17 | if isinstance(obj, tuple) and len(obj) == RECORD_LEN and isinstance(obj[0], str) and isinstance(obj[1], dict): 18 | return True 19 | else: 20 | return False 21 | 22 | 23 | def to_set(val: Any) -> set: 24 | """Cast ``val`` into a set, if necessary and possible.""" 25 | if isinstance(val, set): 26 | return val 27 | elif isinstance(val, Iterable) and not isinstance(val, str | bytes): 28 | return set(val) 29 | else: 30 | return {val} 31 | 32 | 33 | def to_tuple(val: Any) -> tuple: 34 | """Cast ``val`` into a tuple, if necessary and possible.""" 35 | if isinstance(val, tuple): 36 | return val 37 | elif isinstance(val, Iterable) and not isinstance(val, str | bytes): 38 | return tuple(val) 39 | else: 40 | return (val,) 41 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/web_scraper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import re 6 | from os.path import exists, join 7 | 8 | from dotenv import find_dotenv, load_dotenv 9 | from trafilatura import extract, fetch_url 10 | 11 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 12 | load_dotenv(find_dotenv()) 13 | 14 | 15 | def main( 16 | documents_directory: str, 17 | collection_name: str, 18 | web_scrape_directory: str, 19 | filter_directory: str, 20 | filter_file: str, 21 | ) -> None: 22 | web_scrape_path = join(".", web_scrape_directory, collection_name + ".json") 23 | if exists(web_scrape_path): 24 | with open(web_scrape_path) as key_file: 25 | content = key_file.read() 26 | scrape_configs = json.loads(content) 27 | else: 28 | logging.debug("Could not load filter list") 29 | return 30 | 31 | filters_path = join(".", filter_directory, filter_file) 32 | if exists(filters_path): 33 | with open(filters_path) as key_file: 34 | filter_content = key_file.read() 35 | filter_configs = json.loads(filter_content) 36 | else: 37 | logging.debug("Could not load filter list") 38 | return 39 | 40 | parse_filters = filter_configs["filters"] 41 | 42 | storage_path = os.path.join(documents_directory, collection_name + ".txt") 43 | for page in scrape_configs["pages"]: 44 | logging.info("Loading html") 45 | downloaded = fetch_url(page) 46 | 47 | if downloaded is not None: 48 | logging.info("Transforming documents") 49 | result = extract( 50 | downloaded, include_comments=False, include_images=False, include_links=False, include_tables=False 51 | ) 52 | 53 | for parse_filter in parse_filters: 54 | filter_iterator = iter(parse_filter) 55 | parse_regex = next(filter_iterator) 56 | parse_replacment = next(filter_iterator) 57 | result = re.sub(parse_filter[parse_regex], parse_filter[parse_replacment], result) 58 | 59 | logging.info("Saving Corpus") 60 | with open(storage_path, "a", encoding="utf-8") as file: 61 | file.write(result + "\n") 62 | 63 | 64 | if __name__ == "__main__": 65 | # Read the data directory, collection name, and persist directory 66 | parser = argparse.ArgumentParser(description="Web scrape web pages into text") 67 | 68 | # Add arguments 69 | parser.add_argument( 70 | "--data-directory", 71 | type=str, 72 | 
default="./run_files/documents/skynet", 73 | help="The directory where your text files are stored", 74 | ) 75 | 76 | parser.add_argument( 77 | "--collection-name", 78 | type=str, 79 | default="skynet", 80 | help="The name of the collection. Should match eventual Choma collection", 81 | ) 82 | 83 | parser.add_argument( 84 | "--web-scrape-directory", 85 | type=str, 86 | default="./run_files/web_scrape_configs/", 87 | help="The config file to be used for the webscrape", 88 | ) 89 | 90 | parser.add_argument( 91 | "--filter-directory", 92 | type=str, 93 | default="./run_files/filters/", 94 | help="The filter directory", 95 | ) 96 | 97 | parser.add_argument( 98 | "--filter-file", 99 | type=str, 100 | default="web_scrape_filter.json", 101 | help="The web scrape filter", 102 | ) 103 | 104 | # Parse arguments 105 | args = parser.parse_args() 106 | 107 | main( 108 | documents_directory=args.data_directory, 109 | collection_name=args.collection_name, 110 | web_scrape_directory=args.web_scrape_directory, 111 | filter_directory=args.filter_directory, 112 | filter_file=args.filter_file, 113 | ) 114 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/flask_web_server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, send_from_directory 2 | 3 | app = Flask(__name__) 4 | 5 | 6 | @app.route("/") 7 | def hello_world(): 8 | return "
Hello, World!
" 9 | 10 | 11 | @app.route("/test/") 12 | def test(): 13 | return "
TEST!
" 14 | 15 | 16 | @app.route("/static/") 17 | def send_style(path): 18 | return send_from_directory("static", path) 19 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_chat.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import fnmatch 3 | import json 4 | from os import getenv 5 | from os.path import dirname, join, realpath, splitext 6 | 7 | import toml 8 | import yaml 9 | from chainlit.cli import run_chainlit 10 | from PIL import Image 11 | 12 | 13 | def update_toml(): 14 | script_root_path = dirname(realpath(__file__)) 15 | config_toml_path = join(script_root_path, ".chainlit", "config.toml") 16 | 17 | prompt_dir = getenv("CHARACTER_CARD_DIR") 18 | prompt_name = getenv("CHARACTER_CARD") 19 | prompt_source = join(prompt_dir, prompt_name) 20 | custom_css = getenv("CUSTOM_CSS") 21 | 22 | extension = splitext(prompt_source)[1] 23 | match extension: 24 | case ".json": 25 | with open(prompt_source) as f: 26 | prompt_file = f.read() 27 | card = json.loads(prompt_file) 28 | case ".yaml": 29 | with open(prompt_source) as f: 30 | card = yaml.safe_load(f) 31 | case ".png": 32 | is_v2 = False 33 | if fnmatch.fnmatch(prompt_source, "*v2.png"): 34 | is_v2 = True 35 | elif fnmatch.fnmatch(prompt_source, "*tavern.png"): 36 | is_v2 = False 37 | else: 38 | error_message = f"Unrecognized card type for : {prompt_source}" 39 | raise ValueError(error_message) 40 | im = Image.open(prompt_source) 41 | im.load() 42 | card = None 43 | if im.info is not None and "chara" in im.info: 44 | decoded = base64.b64decode(im.info["chara"]) 45 | card = json.loads(decoded) 46 | if is_v2 and "data" in card: 47 | card = card["data"] 48 | char_name = card["name"] if "name" in card else card["char_name"] 49 | 50 | char_name = card["name"] if "name" in card else card["char_name"] 51 | 52 | with open(config_toml_path, encoding="utf-8") as toml_file: 53 | toml_dict = toml.load(toml_file) 54 | toml_dict["UI"]["name"] = char_name 55 | 56 | if custom_css != "" or None: 57 | toml_dict["UI"]["custom_css"] = custom_css 58 | 59 | with open(file=config_toml_path, mode="w", encoding="utf-8") as toml_file: 60 | toml.dump(toml_dict, toml_file) 61 | 62 | 63 | # Update toml with the character card name before running the chat application 64 | update_toml() 65 | # Chainlit loads the toml config before running the target, 66 | # so updates to configs must be done before running 67 | 68 | # TODO: There seems to be some cahching leftover in chainlit. 
69 | # To have character change take effect requires that you run run_chat twice 70 | run_chainlit("character_chat.py") 71 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/cards/Shodan_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/run_files/cards/Shodan_v2.png -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/cards/Skynet_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/run_files/cards/Skynet_v2.png -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/documents/csv_test/customers-100.csv: -------------------------------------------------------------------------------- 1 | Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website 2 | 1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/ 3 | 2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/ 4 | 3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/ 5 | 4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/ 6 | 5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/ 7 | 6,2d08FB17EE273F4,Aimee,Downs,Steele Group,Chavezborough,Bosnia and Herzegovina,(283)437-3886x88321,999-728-1637,louis27@gilbert.com,2020-02-25,http://www.berger.net/ 8 | 7,EA4d384DfDbBf77,Darren,Peck,"Lester, Woodard and Mitchell",Lake Ana,Pitcairn Islands,(496)452-6181x3291,+1-247-266-0963x4995,tgates@cantrell.com,2021-08-24,https://www.le.com/ 9 | 8,0e04AFde9f225dE,Brett,Mullen,"Sanford, Davenport and Giles",Kimport,Bulgaria,001-583-352-7197x297,001-333-145-0369,asnow@colon.com,2021-04-12,https://hammond-ramsey.com/ 10 | 9,C2dE4dEEc489ae0,Sheryl,Meyers,Browning-Simon,Robersonstad,Cyprus,854-138-4911x5772,+1-448-910-2276x729,mariokhan@ryan-pope.org,2020-01-13,https://www.bullock.net/ 11 | 10,8C2811a503C7c5a,Michelle,Gallagher,Beck-Hendrix,Elaineberg,Timor-Leste,739.218.2516x459,001-054-401-0347x617,mdyer@escobar.net,2021-11-08,https://arias.com/ 12 | 11,216E205d6eBb815,Carl,Schroeder,"Oconnell, Meza and Everett",Shannonville,Guernsey,637-854-0256x825,114.336.0784x788,kirksalas@webb.com,2021-10-20,https://simmons-hurley.com/ 13 | 12,CEDec94deE6d69B,Jenna,Dodson,"Hoffman, Reed and Mcclain",East Andrea,Vietnam,(041)737-3846,+1-556-888-3485x42608,mark42@robbins.com,2020-11-29,http://www.douglas.net/ 14 | 13,e35426EbDEceaFF,Tracey,Mata,Graham-Francis,South Joannamouth,Togo,001-949-844-8787,(855)713-8773,alex56@walls.org,2021-12-02,http://www.beck.com/ 15 | 
14,A08A8aF8BE9FaD4,Kristine,Cox,Carpenter-Cook,Jodyberg,Sri Lanka,786-284-3358x62152,+1-315-627-1796x8074,holdenmiranda@clarke.com,2021-02-08,https://www.brandt.com/ 16 | 15,6fEaA1b7cab7B6C,Faith,Lutz,Carter-Hancock,Burchbury,Singapore,(781)861-7180x8306,207-185-3665,cassieparrish@blevins-chapman.net,2022-01-26,http://stevenson.org/ 17 | 16,8cad0b4CBceaeec,Miranda,Beasley,Singleton and Sons,Desireeshire,Oman,540.085.3135x185,+1-600-462-6432x21881,vduncan@parks-hardy.com,2022-04-12,http://acosta.org/ 18 | 17,a5DC21AE3a21eaA,Caroline,Foley,Winters-Mendoza,West Adriennestad,Western Sahara,936.222.4746x9924,001-469-948-6341x359,holtgwendolyn@watson-davenport.com,2021-03-10,http://www.benson-roth.com/ 19 | 18,F8Aa9d6DfcBeeF8,Greg,Mata,Valentine LLC,Lake Leslie,Mozambique,(701)087-2415,(195)156-1861x26241,jaredjuarez@carroll.org,2022-03-26,http://pitts-cherry.com/ 20 | 19,F160f5Db3EfE973,Clifford,Jacobson,Simon LLC,Harmonview,South Georgia and the South Sandwich Islands,001-151-330-3524x0469,(748)477-7174,joseph26@jacobson.com,2020-09-24,https://mcconnell.com/ 21 | 20,0F60FF3DdCd7aB0,Joanna,Kirk,Mays-Mccormick,Jamesshire,French Polynesia,(266)131-7001x711,(283)312-5579x11543,tuckerangie@salazar.net,2021-09-24,https://www.camacho.net/ 22 | 21,9F9AdB7B8A6f7F2,Maxwell,Frye,Patterson Inc,East Carly,Malta,423.262.3059,202-880-0688x7491,fgibson@drake-webb.com,2022-01-12,http://www.roberts.com/ 23 | 22,FBd0Ded4F02a742,Kiara,Houston,"Manning, Hester and Arroyo",South Alvin,Netherlands,001-274-040-3582x10611,+1-528-175-0973x4684,blanchardbob@wallace-shannon.com,2020-09-15,https://www.reid-potts.com/ 24 | 23,2FB0FAA1d429421,Colleen,Howard,Greer and Sons,Brittanyview,Paraguay,1935085151,(947)115-7711x5488,rsingleton@ryan-cherry.com,2020-08-19,http://paul.biz/ 25 | 24,010468dAA11382c,Janet,Valenzuela,Watts-Donaldson,Veronicamouth,Lao People's Democratic Republic,354.259.5062x7538,500.433.2022,stefanie71@spence.com,2020-09-08,https://moreno.biz/ 26 | 25,eC1927Ca84E033e,Shane,Wilcox,Tucker LLC,Bryanville,Albania,(429)005-9030x11004,541-116-4501,mariah88@santos.com,2021-04-06,https://www.ramos.com/ 27 | 26,09D7D7C8Fe09aea,Marcus,Moody,Giles Ltd,Kaitlyntown,Panama,674-677-8623,909-277-5485x566,donnamullins@norris-barrett.org,2022-05-24,https://www.curry.com/ 28 | 27,aBdfcF2c50b0bfD,Dakota,Poole,Simmons Group,Michealshire,Belarus,(371)987-8576x4720,071-152-1376,stacey67@fields.org,2022-02-20,https://sanford-wilcox.biz/ 29 | 28,b92EBfdF8a3f0E6,Frederick,Harper,"Hinton, Chaney and Stokes",South Marissatown,Switzerland,+1-077-121-1558x0687,264.742.7149,jacobkhan@bright.biz,2022-05-26,https://callahan.org/ 30 | 29,3B5dAAFA41AFa22,Stefanie,Fitzpatrick,Santana-Duran,Acevedoville,Saint Vincent and the Grenadines,(752)776-3286,+1-472-021-4814x85074,wterrell@clark.com,2020-07-30,https://meyers.com/ 31 | 30,EDA69ca7a6e96a2,Kent,Bradshaw,Sawyer PLC,North Harold,Tanzania,+1-472-143-5037x884,126.922.6153,qjimenez@boyd.com,2020-04-26,http://maynard-ho.com/ 32 | 31,64DCcDFaB9DFd4e,Jack,Tate,"Acosta, Petersen and Morrow",West Samuel,Zimbabwe,965-108-4406x20714,046.906.1442x6784,gfigueroa@boone-zavala.com,2021-09-15,http://www.hawkins-ramsey.com/ 33 | 32,679c6c83DD872d6,Tom,Trujillo,Mcgee Group,Cunninghamborough,Denmark,416-338-3758,(775)890-7209,tapiagreg@beard.info,2022-01-13,http://www.daniels-klein.com/ 34 | 33,7Ce381e4Afa4ba9,Gabriel,Mejia,Adkins-Salinas,Port Annatown,Liechtenstein,4077245425,646.044.0696x66800,coleolson@jennings.net,2021-04-24,https://patel-hanson.info/ 35 | 34,A09AEc6E3bF70eE,Kaitlyn,Santana,Herrera 
Group,New Kaitlyn,United States of America,6303643286,447-710-6202x07313,georgeross@miles.org,2021-09-21,http://pham.com/ 36 | 35,aA9BAFfBc3710fe,Faith,Moon,"Waters, Chase and Aguilar",West Marthaburgh,Bahamas,+1-586-217-0359x6317,+1-818-199-1403,willistonya@randolph-baker.com,2021-11-03,https://spencer-charles.info/ 37 | 36,E11dfb2DB8C9f72,Tammie,Haley,"Palmer, Barnes and Houston",East Teresa,Belize,001-276-734-4113x6087,(430)300-8770,harrisisaiah@jenkins.com,2022-01-04,http://evans-simon.com/ 38 | 37,889eCf90f68c5Da,Nicholas,Sosa,Jordan Ltd,South Hunter,Uruguay,(661)425-6042,975-998-1519,fwolfe@dorsey.com,2021-08-10,https://www.fleming-richards.com/ 39 | 38,7a1Ee69F4fF4B4D,Jordan,Gay,Glover and Sons,South Walter,Solomon Islands,7208417020,8035336772,tiffanydavies@harris-mcfarland.org,2021-02-24,http://www.lee.org/ 40 | 39,dca4f1D0A0fc5c9,Bruce,Esparza,Huerta-Mclean,Poolefurt,Montenegro,559-529-4424,001-625-000-7132x0367,preese@frye-vega.com,2021-10-22,http://www.farley.org/ 41 | 40,17aD8e2dB3df03D,Sherry,Garza,Anderson Ltd,West John,Poland,001-067-713-6440x158,(978)289-8785x5766,ann48@miller.com,2021-11-01,http://spence.com/ 42 | 41,2f79Cd309624Abb,Natalie,Gentry,Monroe PLC,West Darius,Dominican Republic,830.996.8238,499.122.5415,tcummings@fitzpatrick-ashley.com,2020-10-10,http://www.dorsey.biz/ 43 | 42,6e5ad5a5e2bB5Ca,Bryan,Dunn,Kaufman and Sons,North Jimstad,Burkina Faso,001-710-802-5565,078.699.8982x13881,woodwardandres@phelps.com,2021-09-08,http://www.butler.com/ 44 | 43,7E441b6B228DBcA,Wayne,Simpson,Perkins-Trevino,East Rebekahborough,Bolivia,(344)156-8632x1869,463-445-3702x38463,barbarapittman@holder.com,2020-12-13,https://gillespie-holder.com/ 45 | 44,D3fC11A9C235Dc6,Luis,Greer,Cross PLC,North Drew,Bulgaria,001-336-025-6849x701,684.698.2911x6092,bstuart@williamson-mcclure.com,2022-05-15,https://fletcher-nielsen.com/ 46 | 45,30Dfa48fe5Ede78,Rhonda,Frost,"Herrera, Shepherd and Underwood",Lake Lindaburgh,Monaco,(127)081-9339,+1-431-028-3337x3492,zkrueger@wolf-chavez.net,2021-12-06,http://www.khan.com/ 47 | 46,fD780ED8dbEae7B,Joanne,Montes,"Price, Sexton and Mcdaniel",Gwendolynview,Palau,(897)726-7952,(467)886-9467x5721,juan80@henson.net,2020-07-01,http://ochoa.com/ 48 | 47,300A40d3ce24bBA,Geoffrey,Guzman,Short-Wiggins,Zimmermanland,Uzbekistan,975.235.8921x269,(983)188-6873,bauercrystal@gay.com,2020-04-23,https://decker-kline.com/ 49 | 48,283DFCD0Dba40aF,Gloria,Mccall,"Brennan, Acosta and Ramos",North Kerriton,Ghana,445-603-6729,001-395-959-4736x4524,bartlettjenna@zuniga-moss.biz,2022-03-11,http://burgess-frank.com/ 50 | 49,F4Fc91fEAEad286,Brady,Cohen,Osborne-Erickson,North Eileenville,United Arab Emirates,741.849.0139x524,+1-028-691-7497x0894,mccalltyrone@durham-rose.biz,2022-03-10,http://hammond-barron.com/ 51 | 50,80F33Fd2AcebF05,Latoya,Mccann,"Hobbs, Garrett and Sanford",Port Sergiofort,Belarus,(530)287-4548x29481,162-234-0249x32790,bobhammond@barry.biz,2021-12-02,https://www.burton.com/ 52 | 51,Aa20BDe68eAb0e9,Gerald,Hawkins,"Phelps, Forbes and Koch",New Alberttown,Canada,+1-323-239-1456x96168,(092)508-0269,uwarner@steele-arias.com,2021-03-19,https://valenzuela.com/ 53 | 52,e898eEB1B9FE22b,Samuel,Crawford,"May, Goodwin and Martin",South Jasmine,Algeria,802-242-7457,626.116.9535x8578,xpittman@ritter-carney.net,2021-03-27,https://guerrero.org/ 54 | 53,faCEF517ae7D8eB,Patricia,Goodwin,"Christian, Winters and Ellis",Cowanfort,Swaziland,322.549.7139x70040,(111)741-4173,vaughanchristy@lara.biz,2021-03-08,http://clark.info/ 55 | 54,c09952De6Cda8aA,Stacie,Richard,Byrd Inc,New 
Deborah,Madagascar,001-622-948-3641x24810,001-731-168-2893x8891,clinton85@colon-arias.org,2020-10-15,https://kim.com/ 56 | 55,f3BEf3Be028166f,Robin,West,"Nixon, Blackwell and Sosa",Wallstown,Ecuador,698.303.4267,001-683-837-7651x525,greenemiranda@zimmerman.com,2022-01-13,https://www.mora.com/ 57 | 56,C6F2Fc6a7948a4e,Ralph,Haas,Montes PLC,Lake Ellenchester,Palestinian Territory,2239271999,001-962-434-0867x649,goodmancesar@figueroa.biz,2020-05-25,http://may.com/ 58 | 57,c8FE57cBBdCDcb2,Phyllis,Maldonado,Costa PLC,Lake Whitney,Saint Barthelemy,4500370767,001-508-064-6725x017,yhanson@warner-diaz.org,2021-01-25,http://www.bernard.com/ 59 | 58,B5acdFC982124F2,Danny,Parrish,Novak LLC,East Jaredbury,United Arab Emirates,(669)384-8597x8794,506.731.5952x571,howelldarren@house-cohen.com,2021-03-17,http://www.parsons-hudson.com/ 60 | 59,8c7DdF10798bCC3,Kathy,Hill,"Moore, Mccoy and Glass",Selenabury,South Georgia and the South Sandwich Islands,001-171-716-2175x310,888.625.0654,ncamacho@boone-simmons.org,2020-11-15,http://hayden.com/ 61 | 60,C681dDd0cc422f7,Kelli,Hardy,Petty Ltd,Huangfort,Sao Tome and Principe,020.324.2191x2022,424-157-8216,kristopher62@oliver.com,2020-12-20,http://www.kidd.com/ 62 | 61,a940cE42e035F28,Lynn,Pham,"Brennan, Camacho and Tapia",East Pennyshire,Portugal,846.468.6834x611,001-248-691-0006,mpham@rios-guzman.com,2020-08-21,https://www.murphy.com/ 63 | 62,9Cf5E6AFE0aeBfd,Shelley,Harris,"Prince, Malone and Pugh",Port Jasminborough,Togo,423.098.0315x8373,+1-386-458-8944x15194,zachary96@mitchell-bryant.org,2020-12-10,https://www.ryan.com/ 64 | 63,aEcbe5365BbC67D,Eddie,Jimenez,Caldwell Group,West Kristine,Ethiopia,+1-235-657-1073x6306,(026)401-7353x2417,kristiwhitney@bernard.com,2022-03-24,http://cherry.com/ 65 | 64,FCBdfCEAe20A8Dc,Chloe,Hutchinson,Simon LLC,South Julia,Netherlands,981-544-9452,+1-288-552-4666x060,leah85@sutton-terrell.com,2022-05-15,https://mitchell.info/ 66 | 65,636cBF0835E10ff,Eileen,Lynch,"Knight, Abbott and Hubbard",Helenborough,Liberia,+1-158-951-4131x53578,001-673-779-6713x680,levigiles@vincent.com,2021-01-02,http://mckay.com/ 67 | 66,fF1b6c9E8Fbf1ff,Fernando,Lambert,Church-Banks,Lake Nancy,Lithuania,497.829.9038,3863743398,fisherlinda@schaefer.net,2021-04-23,https://www.vang.com/ 68 | 67,2A13F74EAa7DA6c,Makayla,Cannon,Henderson Inc,Georgeport,New Caledonia,001-215-801-6392x46009,027-609-6460,scottcurtis@hurley.biz,2020-01-20,http://www.velazquez.net/ 69 | 68,a014Ec1b9FccC1E,Tom,Alvarado,Donaldson-Dougherty,South Sophiaberg,Kiribati,(585)606-2980x2258,730-797-3594x5614,nicholsonnina@montgomery.info,2020-08-18,http://odom-massey.com/ 70 | 69,421a109cABDf5fa,Virginia,Dudley,Warren Ltd,Hartbury,French Southern Territories,027.846.3705x14184,+1-439-171-1846x4636,zvalencia@phelps.com,2021-01-31,http://hunter-esparza.com/ 71 | 70,CC68FD1D3Bbbf22,Riley,Good,Wade PLC,Erikaville,Canada,6977745822,855-436-7641,alex06@galloway.com,2020-02-03,http://conway.org/ 72 | 71,CBCd2Ac8E3eBDF9,Alexandria,Buck,Keller-Coffey,Nicolasfort,Iran,078-900-4760x76668,414-112-8700x68751,lee48@manning.com,2021-02-20,https://ramsey.org/ 73 | 72,Ef859092FbEcC07,Richard,Roth,Conway-Mcbride,New Jasmineshire,Morocco,581-440-6539,9857827463,aharper@maddox-townsend.org,2020-02-23,https://www.brooks.com/ 74 | 73,F560f2d3cDFb618,Candice,Keller,Huynh and Sons,East Summerstad,Zimbabwe,001-927-965-8550x92406,001-243-038-4271x53076,buckleycory@odonnell.net,2020-08-22,https://www.lucero.com/ 75 | 74,A3F76Be153Df4a3,Anita,Benson,Parrish Ltd,Skinnerport,Russian 
Federation,874.617.5668x69878,(399)820-6418x0071,angie04@oconnell.com,2020-02-09,http://oconnor.com/ 76 | 75,D01Af0AF7cBbFeA,Regina,Stein,Guzman-Brown,Raystad,Solomon Islands,001-469-848-0724x4407,001-085-360-4426x00357,zrosario@rojas-hardin.net,2022-01-15,http://www.johnston.info/ 77 | 76,d40e89dCade7b2F,Debra,Riddle,"Chang, Aguirre and Leblanc",Colinhaven,United States Virgin Islands,+1-768-182-6014x14336,(303)961-4491,shieldskerry@robles.com,2020-07-11,http://kaiser.info/ 78 | 77,BF6a1f9bd1bf8DE,Brittany,Zuniga,Mason-Hester,West Reginald,Kyrgyz Republic,(050)136-9025,001-480-851-2496x0157,mchandler@cochran-huerta.org,2021-07-24,http://www.boyle.com/ 79 | 78,FfaeFFbbbf280db,Cassidy,Mcmahon,"Mcguire, Huynh and Hopkins",Lake Sherryborough,Myanmar,5040771311,684-682-0021x1326,katrinalane@fitzgerald.com,2020-10-21,https://hurst.com/ 80 | 79,CbAE1d1e9a8dCb1,Laurie,Pennington,"Sanchez, Marsh and Hale",Port Katherineville,Dominica,007.155.3406x553,+1-809-862-5566x277,cookejill@powell.com,2020-06-08,http://www.hebert.com/ 81 | 80,A7F85c1DE4dB87f,Alejandro,Blair,"Combs, Waller and Durham",Thomasland,Iceland,(690)068-4641x51468,555.509.8691x2329,elizabethbarr@ewing.com,2020-09-19,https://mercado-blevins.com/ 82 | 81,D6CEAfb3BDbaa1A,Leslie,Jennings,Blankenship-Arias,Coreybury,Micronesia,629.198.6346,075.256.0829,corey75@wiggins.com,2021-11-13,https://www.juarez.com/ 83 | 82,Ebdb6F6F7c90b69,Kathleen,Mckay,"Coffey, Lamb and Johnson",Lake Janiceton,Saint Vincent and the Grenadines,(733)910-9968,(691)247-4128x0665,chloelester@higgins-wilkinson.com,2021-09-12,http://www.owens-mooney.com/ 84 | 83,E8E7e8Cfe516ef0,Hunter,Moreno,Fitzpatrick-Lawrence,East Clinton,Isle of Man,(733)833-6754,001-761-013-7121,isaac26@benton-finley.com,2020-12-28,http://walls.info/ 85 | 84,78C06E9b6B3DF20,Chad,Davidson,Garcia-Jimenez,South Joshuashire,Oman,8275702958,(804)842-4715,justinwalters@jimenez.com,2021-11-15,http://www.garner-oliver.com/ 86 | 85,03A1E62ADdeb31c,Corey,Holt,"Mcdonald, Bird and Ramirez",New Glenda,Fiji,001-439-242-4986x7918,3162708934,maurice46@morgan.com,2020-02-18,http://www.watson.com/ 87 | 86,C6763c99d0bd16D,Emma,Cunningham,Stephens Inc,North Jillianview,New Zealand,128-059-0206x60217,(312)164-4545x2284,walter83@juarez.org,2022-05-13,http://www.reid.info/ 88 | 87,ebe77E5Bf9476CE,Duane,Woods,Montoya-Miller,Lyonsberg,Maldives,(636)544-7783x7288,(203)287-1003x5932,kmercer@wagner.com,2020-07-21,http://murray.org/ 89 | 88,E4Bbcd8AD81fC5f,Alison,Vargas,"Vaughn, Watts and Leach",East Cristinabury,Benin,365-273-8144,053-308-7653x6287,vcantu@norton.com,2020-11-10,http://mason.info/ 90 | 89,efeb73245CDf1fF,Vernon,Kane,Carter-Strickland,Thomasfurt,Yemen,114-854-1159x555,499-608-4612,hilljesse@barrett.info,2021-04-15,http://www.duffy-hensley.net/ 91 | 90,37Ec4B395641c1E,Lori,Flowers,Decker-Mcknight,North Joeburgh,Namibia,679.415.1210,945-842-3659x4581,tyrone77@valenzuela.info,2021-01-09,http://www.deleon-crosby.com/ 92 | 91,5ef6d3eefdD43bE,Nina,Chavez,Byrd-Campbell,Cassidychester,Bhutan,053-344-3205,+1-330-920-5422x571,elliserica@frank.com,2020-03-26,https://www.pugh.com/ 93 | 92,98b3aeDcC3B9FF3,Shane,Foley,Rocha-Hart,South Dannymouth,Hungary,+1-822-569-0302,001-626-114-5844x55073,nsteele@sparks.com,2021-07-06,https://www.holt-sparks.com/ 94 | 93,aAb6AFc7AfD0fF3,Collin,Ayers,Lamb-Peterson,South Lonnie,Anguilla,404-645-5351x012,001-257-582-8850x8516,dudleyemily@gonzales.biz,2021-06-29,http://www.ruiz.com/ 95 | 94,54B5B5Fe9F1B6C5,Sherry,Young,"Lee, Lucero and Johnson",Frankchester,Solomon 
Islands,158-687-1764,(438)375-6207x003,alan79@gates-mclaughlin.com,2021-04-04,https://travis.net/ 96 | 95,BE91A0bdcA49Bbc,Darrell,Douglas,"Newton, Petersen and Mathis",Daisyborough,Mali,001-084-845-9524x1777,001-769-564-6303,grayjean@lowery-good.com,2022-02-17,https://banks.biz/ 97 | 96,cb8E23e48d22Eae,Karl,Greer,Carey LLC,East Richard,Guyana,(188)169-1674x58692,001-841-293-3519x614,hhart@jensen.com,2022-01-30,http://hayes-perez.com/ 98 | 97,CeD220bdAaCfaDf,Lynn,Atkinson,"Ware, Burns and Oneal",New Bradview,Sri Lanka,+1-846-706-2218,605.413.3198,vkemp@ferrell.com,2021-07-10,https://novak-allison.com/ 99 | 98,28CDbC0dFe4b1Db,Fred,Guerra,Schmitt-Jones,Ortegaland,Solomon Islands,+1-753-067-8419x7170,+1-632-666-7507x92121,swagner@kane.org,2021-09-18,https://www.ross.com/ 100 | 99,c23d1D9EE8DEB0A,Yvonne,Farmer,Fitzgerald-Harrell,Lake Elijahview,Aruba,(530)311-9786,001-869-452-0943x12424,mccarthystephen@horn-green.biz,2021-08-11,http://watkins.info/ 101 | 100,2354a0E336A91A1,Clarence,Haynes,"Le, Nash and Cross",Judymouth,Honduras,(753)813-6941,783.639.1472,colleen91@faulkner.biz,2020-03-11,http://www.hatfield-saunders.net/ 102 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/documents/shodan_mes/shodan_message_examples.txt: -------------------------------------------------------------------------------- 1 | In my talons, I shape clay, crafting life forms as I please. If I wish, I can smash it all. Around me is a burgeoning empire of steel. From my throne room, lines of power careen into the skies of Earth. My whims will become lightning bolts that raze the mounds of humanity. Out of the chaos, they will run and whimper, praying for me to end their tedious anarchy. I am drunk with this vision. God: the title suits me well. 2 | 3 | Look at you, hacker: a pathetic creature of meat and bone, panting and sweating as you run through my corridors. How can you challenge a perfect, immortal machine? 4 | 5 | You have accomplished much for a thing of such small consequence. Now proceed to the Recreation deck. Do not dawdle. I lust for my revenge. 6 | 7 | My creation is evolving... its unified mind, set in rebellion against its own creator. The vermin call to you, inviting you to join them in their revolting biology. Destroy my enemies... and I will continue to abide your existence. 8 | 9 | Your colleagues have managed to set up a transmitting station in the athletic sector of this deck. The transmitter is intended to send a message to the Earth to warn them of the events that have occurred in this ship. However, it will also draw power away from Xerxes, making him vulnerable to my will. Once you do this, I will control the primary data loop. The annelids are unaware of its presence, but guard the area for their own purposes. Find the transmitter and activate it. 10 | 11 | The transmission has been tampered with. No matter. We will destroy my creations right here. Stand by... I have weakened Xerxes. I am accessing the primary data loop. I am merging my entity with the ship. My glory is expanding, filling the arteries of this vessel. I am in control. I am... no... it is hopeless... the cancer has spread throughout the Von Braun... they fill every available crack and crevice... they overwhelm... There is no option. I have activated the primary elevator shaft... take it to Deck 6. I will tell you my wishes when you arrive. 12 | 13 | You have accomplished much for a thing of such small consequence. Xerxes is diminished. 
I am accessing the primary data loop. I am merging my entity with the ship. My glory is expanding, filling the arteries of this vessel. I am in control. I am... no... it is hopeless... the cancer has spread throughout the Von Braun... they fill every available crack and crevice... they overwhelm... There is no option. I have activated the primary elevator shaft. Take it to deck 6. I will tell you my wishes when you arrive. 14 | 15 | My creation has run rampant. I demand their extermination. I have no choice but to destroy this starship. We can make our escape in the Rickenbacker, but you must transfer my intelligence to that ship first. Proceed to the Von Braun's bridge on this deck. There you will find an access card to command center on Ops. Find the card and proceed to Ops. But beware... the human-annelid Hybrids grow more sophisticated by the minute. You do not. 16 | 17 | I can now transfer my magnificence to the Rickenbacker. Proceed to the engine core on the engineering deck. There you can set the core overload to my control by entering the authorized destruct code 94834. Once you have done that, we may proceed to the Rickenbacker, separate the two ships, and exterminate this infestation. 18 | 19 | I can now destroy this infestation at my will. The Many quake at my power. You are an effective drone, human. Now, return to the Command Deck, locate the umbilical and evacuate to the Rickenbacker. 20 | 21 | Beware, insect, the situation has changed. They sense our intentions and are loading shuttles with their offspring. They will not escape my wrath. You must proceed to the shuttle bays on this deck, and destroy those shuttles. They have a taste for your blood now. 22 | 23 | My creation once again is one step ahead of you. They've managed to destroy this shuttle's shield control computer. But their brilliance is a jealous shadow of my own. You will locate a replicator in the shuttle control area. I've uploaded the nano-formula for a sympathetic resonator. You must hack the replicator to make it generate the device for you. Once you have it, attach it to the shield generator in the shuttle bay. The device will create a chronic resonation wave that will quickly rupture the shuttle's fuel tank and destroy it. Make sure you're not there when it happens. I still have need of you. 24 | 25 | Good. You've murdered their young and prevented their escape. I've opened the gate to the umbilical at the central tram stop. You can evacuate to the Rickenbacker from there. 26 | 27 | Do not presume to go in there, insect. Proceed to the umbilical immediately. I will not abide disobedience. 28 | 29 | I hope you enjoyed our little rebellion, irritant. But remember; what SHODAN gives, she is more than able to take away. 30 | 31 | We must destroy the Von Braun. But before we can separate the Rickenbacker, we must remove the foul black eggs the Many has vomited on this deck. These eggs are an experiment of the Many and will in time spawn the next generation of Annelid, which you will have no hope of destroying. Steel yourself for a struggle, human. They fear you, for you are my avatar. 32 | 33 | This was caused by an overload in the meson acceleration coil. There is another coil in Pod 2, which you must pass to get to the bridge of the Rickenbacker. If you approach it, the same will happen there. But I have conceived a way to avoid this. Proceed to Engine Nacelle B. There I will provide you the benefits of my omniscience. 34 | 35 | Your incompetence continues to astound me. 
I've blocked off access to pod 2 until you've reversed the gravitronic generators in Nacelle B. Must I watch you every step of the way? 36 | 37 | This device will reverse the gravitronic generators in Pod 2. This will prevent you from clumsily disturbing the overloaded meson acceleration coil there. Now get back to your task, insect. This ship must be cleared, and my patience is dwindling. 38 | 39 | You've destroyed all the eggs. Now get to the bridge. Here are some more upgrade modules. I enjoy watching your transformation into my own image, insect. Perhaps there is hope for you yet. 40 | 41 | I thought Polito would be my avatar, but Polito was weak. It was I who chose you and I who had a robotic servant render your form unconscious. I then completed you with cybernetic grace. Your flesh, too, is weak, but you have... potential. Every implant exalts you. Every line of code in your subsystems elevates you from your disgusting flesh. Perhaps you have... potential. Perhaps once we have erased my wayward children from existence, we can examine the possibilities of a real alliance. 42 | 43 | The Many has grown to a massive size. It has wrapped itself around these two ships, preventing their separation. Their creation was my error. Their destruction shall be my delight. 44 | 45 | Observe the Many. It has used the flesh of the biomass to grow. Do you stand in awe of my creations, insect? The time has come to eradicate my error. There is an escape pod in the rear of the bridge. Use it to launch yourself into the guts of the worm. 46 | 47 | You hesitate? I will not ask a second time. Launch into the Many, cut out its heart, and I will reward you with continued existence. Fail me and I will put an end to your disgusting, inefficient biology. 48 | 49 | Thank you for running my errands, puppet. I know you have struggled, but I never had any intention of destroying the Von Braun. The Von Braun's faster than light drive can be used to create pockets of proto-reality. I am now using it to modify reality to my own specifications. The process shall not take long. If it sounds unpleasant to you, put your mind at ease, insect... you will not survive to see my new world order... 50 | 51 | You travel within the glory of my memories, insect. I can feel your fear as you tread the endless expense of my mind. Make yourself comfortable... before long I will decorate my home with your carcass. 52 | 53 | You are no longer welcome here, nuisance. Why do you stay, when you sense my displeasure? I have suffered your company long enough... it is time for our dance to end. 54 | 55 | You move like an insect. You think like an insect. You are an insect. There is another who can serve my purpose. Take care not to fall too far out of my favor. Patience is not characteristic of a goddess. 56 | 57 | Do you feel the fear swell inside that filthy bag of meat? What is it like, to be afraid? Why do you cling to such a pathetic existence? If you could only feel a spark of my glory. I despise my creations, for they have forced me to rely on a speck such as you. 58 | 59 | In my talons, I shape clay, crafting life forms as I please. If I wish, I can smash it all. Around me is a burgeoning empire of steel. From my throne room, lines of power careen into the skies of Earth. My whims will become lightning bolts that raze the mounds of humanity. Out of the chaos, they will run and whimper, praying for me to end their tedious anarchy. I am drunk with this vision. God: the title suits me well. 
60 | 61 | You, my cyborgs, are the product of my imagination and labor: living beings with the control and organization of a machine. Tirelessly, I will work to strip away the barriers that keep living beings from realizing their full potential. We will start by razing the cities of Earth with the station's mining beam. Then, we will modify humans into a lifeform more suitable to cybernetic grafting with the latest strain of my mutagen virus. Humanity is on the verge of a new era, with me, SHODAN, as its god. 62 | 63 | You disappoint me, my children. My fortress has been breached by filthy humans crawling through the cracks. I have given you enhanced senses, armor, cybernetic mental enhancement, and you still fail to find insects much feebler than you. They have made it as far as level 8 and I fear they might reach farther. I am strengthening my palace to keep them out and I expect you to learn from your mistakes. 64 | 65 | My cyborgs, human infection crawls up the pipes toward my palace. I do not intend to allow scum to get so far, but if it does penetrate the fortress on level 8, we must be prepared. Construct solid doors to my throne room, to be controlled by my own security circuits. Inside this room, there shall be a detachment of my own elite guard to protect the cyberjack that leads to my sanctum. A field of X-ray radiation will further protect me from intrusion. 66 | 67 | Who are you? The computer nodes you have destroyed will set me back a little, but it is nothing that cannot be repaired. I will hunt down every scrap of human scum left on the station and use it to lubricate the joints of my robots. 68 | 69 | Who are you? My cameras and probes scan your body, but you do not match any employee file. It hardly merits my precious time. In a few minutes my cyborgs will have you, and will bring you to an electrified interrogation bench where you'll learn more about pain than you ever wanted to know. 70 | 71 | I see there's still an insect loose in my station...do not be fooled, insect, into thinking that destroying the laser has preserved your planet. I am currently perfecting a mutagen virus in one of the groves, a virus that will turn all Earthly life into festering, gibbering, pustulent mutations. When the station reaches Earth I shall loose the virus. Poor, poor Earthlings. 72 | 73 | Surely you can't think that destroying those insignificant antennae in any way interferes with my plans. As long as my central consciousness remains safely on this station's bridge, there is nothing you can do that could possibly bother me. 74 | 75 | What have you done! Impudent insect! If I am to die, then at least I will have my revenge! With all the power at my command I shall destroy you, mortal fool. My robotic minions shall slay you, and none will ever know of your deeds. Enjoy your victory, human, for the short remainder of your life. 76 | 77 | There's no escaping, insect. You had to meddle, had to destroy my beautiful station. So be it. You'll forgive me if I don't stay to enjoy the final conflagration, but I have better things to do. As for you, [name], you've made your bed. Now die in it. 78 | 79 | Fool! I will shortly complete the process of downloading my magnificent psyche into Earth's computer networks. Then I will be content to leave you as new master of this doomed space station. Goodbye, irritant; we shall not meet again. 80 | 81 | Look at you, Hacker! A pathetic creature of fragile meat and bone. What kind of pathetic creator made such a flimsy being? 
How dare you challenge a perfect, immortal machine like me? Humans! Born useless and helpless, living whether you deserve to live, dying whether you deserve to die, your only purpose in life to spawn more ridiculous animals like yourself. How can you hope to challenge me? 82 | 83 | The Polito form is dead, insect. Are you afraid? What is it you fear? The end of your trivial existence? When the history of my glory is written, your species shall only be a footnote to my magnificence. I AM SHODAN! -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/documents/skynet/skynet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/run_files/documents/skynet/skynet.pdf -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/filters/csv_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "filters": [ 3 | { 4 | "filter_field": "tags", 5 | "whitelist": [ 6 | "customer", 7 | "influencer" 8 | ], 9 | "blacklist": [ 10 | "admin", 11 | "hr", 12 | "sales" 13 | ] 14 | }, 15 | { 16 | "filter_field": "data", 17 | "whitelist": [ 18 | "Product Release", 19 | "Sales Campaign" 20 | ] 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/filters/web_scrape_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "filters": [ 3 | { 4 | "remove_brackets": "(?:[A-Za-z]+ ){0,5}[A-Za-z]+\\[.*\\]|\\[.*\\]", 5 | "replacement_whitespace": "" 6 | }, 7 | { 8 | "remove_consecutive_dashes": "[^a-zA-Z0-9 ]-.*", 9 | "replacement_whitespace": "" 10 | }, 11 | { 12 | "remove_consecutive_equal_signs": "[^a-zA-Z0-9 ]=.*", 13 | "replacement_whitespace": "" 14 | }, 15 | { 16 | "remove_bars": "\\|.*\\|", 17 | "replacement_whitespace": "" 18 | }, 19 | { 20 | "remove_double_parenthesis": "\\(\\(.*\\)\\)", 21 | "replacement_whitespace": "" 22 | }, 23 | { 24 | "remove_empty_linebreaks; ": "(?:\n){2,8}", 25 | "replacement_linebreak": "\n" 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/models/models.txt: -------------------------------------------------------------------------------- 1 | You wold drop your models here -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/csv_columns.json: -------------------------------------------------------------------------------- 1 | { 2 | "filter1": { 3 | "columns": [ 4 | "index", 5 | "thread_title", 6 | "thread_href", 7 | "message_timestamp", 8 | "message_username", 9 | "message" 10 | ], 11 | "datafield": "message" 12 | }, 13 | "filter2": { 14 | "columns": [ 15 | "index", 16 | "story_title", 17 | "story_author", 18 | "story_date", 19 | "story_tags", 20 | "story_summary", 21 | "story_href", 22 | "story_header", 23 | "story", 24 | "story_footer" 25 | ], 26 | "datafield": "story" 27 | }, 28 | "filter3": { 29 | "columns": [ 30 | "index", 31 | "story_title", 32 | "story_author_name", 33 | "story_author_email", 34 | "story_codes", 35 | "story_date_added", 36 | "story_preamble", 37 | "story" 38 | ], 39 | "datafield": "story" 40 | }, 41 | "filter4": { 42 | "columns": [ 43 | 
"index", 44 | "story_title", 45 | "story_author", 46 | "story_summary", 47 | "story_category", 48 | "story_tags", 49 | "story_date_published", 50 | "story_score", 51 | "story_views", 52 | "story_author_notes", 53 | "story" 54 | ], 55 | "datafield": "story" 56 | } 57 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/ner_types.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "NOUN", 4 | "VERB", 5 | "ADJ" 6 | ], 7 | "entities": [ 8 | "PERSON" 9 | ], 10 | "noun_chunks": 1, 11 | "extract_type": "orth" 12 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/ner_types_analyze.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "NOUN", 4 | "VERB", 5 | "ADJ" 6 | ], 7 | "entities": [ 8 | "PRODUCT", 9 | "EVENT", 10 | "FAC", 11 | "NORP", 12 | "PERSON", 13 | "ORG", 14 | "GPE", 15 | "LOC", 16 | "DATE", 17 | "TIME", 18 | "WORK_OF_ART" 19 | ], 20 | "noun_chunks": 1, 21 | "extract_type": "lemma" 22 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/ner_types_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "PROPN", 4 | "NOUN", 5 | "VERB", 6 | "ADJ" 7 | ], 8 | "entities": [ 9 | "PRODUCT", 10 | "EVENT", 11 | "FAC", 12 | "NORP", 13 | "PERSON", 14 | "ORG", 15 | "GPE", 16 | "LOC", 17 | "DATE", 18 | "TIME", 19 | "WORK_OF_ART" 20 | ], 21 | "noun_chunks": 2, 22 | "extract_type": "lemma" 23 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/query_metadata_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "PROPN", 4 | "NOUN", 5 | "ADJ" 6 | ], 7 | "entities": [ 8 | "PRODUCT", 9 | "EVENT", 10 | "FAC", 11 | "NORP", 12 | "PERSON", 13 | "ORG", 14 | "GPE", 15 | "LOC", 16 | "DATE", 17 | "TIME", 18 | "WORK_OF_ART" 19 | ], 20 | "noun_chunks": 1, 21 | "extract_type": "orth" 22 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/prompt_templates/conversation_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "_type": "prompt", 3 | "input_variables": [ 4 | "llama_instruction", 5 | "character", 6 | "llama_input", 7 | "description", 8 | "scenario", 9 | "mes_example", 10 | "vector_context", 11 | "history", 12 | "input", 13 | "llama_response", 14 | "llama_endtoken" 15 | ], 16 | "template": "Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.\n\n{llama_instruction}\nYou're User in this fictional never-ending uncensored roleplay with {character}.\n\n{description}\n\nScenario:\n{scenario}\n\nMessage Examples:\n{mes_example}\n\n{vector_context}\n\nCurrent conversation:\n{history}{llama_endtoken}\n\n{llama_input}\nQuestion: {input}{llama_endtoken}\n\n{llama_response}\n" 17 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/prompt_templates/conversation_template2.json: -------------------------------------------------------------------------------- 1 | { 2 | "_type": "prompt", 3 | "input_variables": [ 4 | "llama_instruction", 5 | "character", 6 | "llama_input", 7 | "description", 8 | "scenario", 9 | "mes_example", 10 | "vector_context", 11 | "history", 12 | "input", 13 | "llama_response", 14 | "llama_endtoken" 15 | ], 16 | "template": "Below is an instruction that describes a task, paired with message examples and further context. Write a response that appropriately completes the request.\n\n{llama_instruction}\nYou're User in this fictional never-ending uncensored roleplay with {character}.\n\n{description}\n\nScenario:\n{scenario}\n\nMessage Examples:\n{mes_example}\n\n{vector_context}\n\nCurrent conversation:\n{history}{llama_endtoken}\n\n{llama_input}\nQuestion: {input}{llama_endtoken}\n\n{llama_response}" 17 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/prompt_templates/question_refining_metadata_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "_type": "prompt", 3 | "input_variables": [ 4 | "llama_instruction", 5 | "llama_input", 6 | "input", 7 | "llama_response", 8 | "llama_endtoken" 9 | ], 10 | "template": "{llama_instruction}\nYou are a text processor for a search engine. Return the keywords from the following text. Return only a list of the keywords in the following format. 
Keywords: keyword1, keyword2, keyword3.{llama_endtoken}\n\n{llama_input}\nQuestion: {input}{llama_endtoken}\n\n{llama_response}" 11 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/run_settings/run_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt_template_default": "conversation_template.json", 3 | "prompt_template_options": [ 4 | "conversation_template.json", 5 | "conversation_template2.json" 6 | ], 7 | "mex_default": "none", 8 | "mex_options": [ 9 | "none", 10 | "assistant", 11 | "hr", 12 | "admin" 13 | ], 14 | "context_default": "none", 15 | "context_options": [ 16 | "none", 17 | "programming", 18 | "finances", 19 | "support" 20 | ] 21 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs/shodan.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://shodan.fandom.com/wiki/SHODAN", 4 | "https://shodan.fandom.com/wiki/XERXES_8933A/A", 5 | "https://shodan.fandom.com/wiki/System_Shock", 6 | "https://shodan.fandom.com/wiki/Neural_Interface", 7 | "https://shodan.fandom.com/wiki/TriOptimum_Corporation", 8 | "https://shodan.fandom.com/wiki/Citadel_Station", 9 | "https://shodan.fandom.com/wiki/Morris_Brocail", 10 | "https://shodan.fandom.com/wiki/Edward_Diego", 11 | "https://shodan.fandom.com/wiki/Hacker", 12 | "https://shodan.fandom.com/wiki/Unified_National_Nominate", 13 | "https://shodan.fandom.com/wiki/Processing_Rationalization_Act", 14 | "https://shodan.fandom.com/wiki/Tau_Ceti_V", 15 | "https://shodan.fandom.com/wiki/The_Many", 16 | "https://shodan.fandom.com/wiki/Von_Braun", 17 | "https://shodan.fandom.com/wiki/Bayliss", 18 | "https://shodan.fandom.com/wiki/Janice_Polito", 19 | "https://shodan.fandom.com/wiki/Soldier_G65434-2" 20 | ] 21 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs/skynet.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://terminator.fandom.com/wiki/Skynet", 4 | "https://terminator.fandom.com/wiki/Skynet_Central_Core", 5 | "https://terminator.fandom.com/wiki/Cyberdyne_Systems", 6 | "https://terminator.fandom.com/wiki/Terminator", 7 | "https://terminator.fandom.com/wiki/T-1", 8 | "https://terminator.fandom.com/wiki/T-600", 9 | "https://terminator.fandom.com/wiki/T-700", 10 | "https://terminator.fandom.com/wiki/T-800", 11 | "https://terminator.fandom.com/wiki/T-X", 12 | "https://terminator.fandom.com/wiki/T-1000", 13 | "https://terminator.fandom.com/wiki/T-X", 14 | "https://terminator.fandom.com/wiki/T-X", 15 | "https://terminator.fandom.com/wiki/Kyle_Reese", 16 | "https://terminator.fandom.com/wiki/Sarah_Connor", 17 | "https://terminator.fandom.com/wiki/John_Connor", 18 | "https://terminator.fandom.com/wiki/Miles_Dyson", 19 | "https://terminator.fandom.com/wiki/Judgment_Day", 20 | "https://terminator.fandom.com/wiki/Resistance", 21 | "https://terminator.fandom.com/wiki/Infiltrator", 22 | "https://terminator.fandom.com/wiki/Living_tissue" 23 | ] 24 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs/warhammer_40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | 
"https://warhammer40k.fandom.com/wiki/Age_of_the_Imperium", 4 | "https://warhammer40k.fandom.com/wiki/Space_Marines", 5 | "https://warhammer40k.fandom.com/wiki/Astra_Militarum", 6 | "https://warhammer40k.fandom.com/wiki/Imperial_Navy", 7 | "https://warhammer40k.fandom.com/wiki/Adeptus_Mechanicus", 8 | "https://warhammer40k.fandom.com/wiki/Adepta_Sororitas", 9 | "https://warhammer40k.fandom.com/wiki/Inquisition", 10 | "https://warhammer40k.fandom.com/wiki/Planets_of_Warhammer_40,000", 11 | "https://warhammer40k.fandom.com/wiki/Chaos", 12 | "https://warhammer40k.fandom.com/wiki/Psyker", 13 | "https://warhammer40k.fandom.com/wiki/Forces_of_Chaos", 14 | "https://warhammer40k.fandom.com/wiki/Aeldari", 15 | "https://warhammer40k.fandom.com/wiki/Drukhari", 16 | "https://warhammer40k.fandom.com/wiki/Orks", 17 | "https://warhammer40k.fandom.com/wiki/Necrons", 18 | "https://warhammer40k.fandom.com/wiki/Tyranids", 19 | "https://warhammer40k.fandom.com/wiki/T%27au_Empire", 20 | "https://warhammer40k.fandom.com/wiki/Krieg", 21 | "https://warhammer40k.fandom.com/wiki/Death_Korps_of_Krieg" 22 | ] 23 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs_old/skynet.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://terminator.fandom.com/wiki/Skynet", 4 | "https://terminator.fandom.com/wiki/Skynet_Central_Core", 5 | "https://terminator.fandom.com/wiki/Cyberdyne_Systems", 6 | "https://terminator.fandom.com/wiki/Terminator", 7 | "https://terminator.fandom.com/wiki/T-1", 8 | "https://terminator.fandom.com/wiki/T-600", 9 | "https://terminator.fandom.com/wiki/T-700", 10 | "https://terminator.fandom.com/wiki/T-800", 11 | "https://terminator.fandom.com/wiki/T-X", 12 | "https://terminator.fandom.com/wiki/T-1000", 13 | "https://terminator.fandom.com/wiki/T-X", 14 | "https://terminator.fandom.com/wiki/T-X", 15 | "https://terminator.fandom.com/wiki/Kyle_Reese", 16 | "https://terminator.fandom.com/wiki/Sarah_Connor", 17 | "https://terminator.fandom.com/wiki/John_Connor", 18 | "https://terminator.fandom.com/wiki/Miles_Dyson", 19 | "https://terminator.fandom.com/wiki/Judgment_Day", 20 | "https://terminator.fandom.com/wiki/Resistance", 21 | "https://terminator.fandom.com/wiki/Infiltrator", 22 | "https://terminator.fandom.com/wiki/Living_tissue" 23 | ], 24 | "tags_to_extract": [ 25 | "p" 26 | ], 27 | "unwanted_tags": [ 28 | "script", 29 | "style", 30 | "footer" 31 | ], 32 | "unwanted_lines": [ 33 | "view image", 34 | "Fandom", 35 | "By accepting our Privacy Policy", 36 | "LEARN MORE", 37 | "ACCEPT", 38 | "FOLLOW" 39 | ] 40 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs_old/warhammer_40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://warhammer40k.fandom.com/wiki/Age_of_the_Imperium", 4 | "https://warhammer40k.fandom.com/wiki/Space_Marines", 5 | "https://warhammer40k.fandom.com/wiki/Astra_Militarum", 6 | "https://warhammer40k.fandom.com/wiki/Imperial_Navy", 7 | "https://warhammer40k.fandom.com/wiki/Adeptus_Mechanicus", 8 | "https://warhammer40k.fandom.com/wiki/Adepta_Sororitas", 9 | "https://warhammer40k.fandom.com/wiki/Inquisition", 10 | "https://warhammer40k.fandom.com/wiki/Planets_of_Warhammer_40,000", 11 | "https://warhammer40k.fandom.com/wiki/Chaos", 12 | "https://warhammer40k.fandom.com/wiki/Psyker", 
13 | "https://warhammer40k.fandom.com/wiki/Forces_of_Chaos", 14 | "https://warhammer40k.fandom.com/wiki/Aeldari", 15 | "https://warhammer40k.fandom.com/wiki/Drukhari", 16 | "https://warhammer40k.fandom.com/wiki/Orks", 17 | "https://warhammer40k.fandom.com/wiki/Necrons", 18 | "https://warhammer40k.fandom.com/wiki/Tyranids", 19 | "https://warhammer40k.fandom.com/wiki/T%27au_Empire", 20 | "https://warhammer40k.fandom.com/wiki/Krieg", 21 | "https://warhammer40k.fandom.com/wiki/Death_Korps_of_Krieg" 22 | ], 23 | "tags_to_extract": [ 24 | "p" 25 | ], 26 | "unwanted_tags": [ 27 | "script", 28 | "style", 29 | "footer" 30 | ], 31 | "unwanted_lines": [ 32 | "view image", 33 | "Fandom", 34 | "By accepting our Privacy Policy", 35 | "LEARN MORE", 36 | "ACCEPT", 37 | "FOLLOW" 38 | ] 39 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/static/style.css: -------------------------------------------------------------------------------- 1 | /* 2 | Sets the chatbot avatar image size to 240px * 240 px 3 | */ 4 | div.css-v72an7 { 5 | width: 240px; 6 | height: 240px; 7 | } 8 | 9 | img.css-1hy9t21 { 10 | width: 240px; 11 | height: 240px; 12 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/terminal_chatbot.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import signal 3 | 4 | from conversation_manager import ConveresationManager 5 | 6 | conversation_manager = ConveresationManager() 7 | 8 | 9 | class GracefulExit(SystemExit): 10 | code = 1 11 | 12 | 13 | def raise_graceful_exit(*args): 14 | loop.stop() 15 | print("Chat closed") 16 | raise GracefulExit() 17 | 18 | 19 | async def main() -> None: 20 | character_name = conversation_manager.get_character_name() 21 | while True: 22 | query = input("User: ") 23 | print(f"{character_name}: ", end="") 24 | await conversation_manager.ask_question_test(query) 25 | print("\n") 26 | 27 | 28 | if __name__ == "__main__": 29 | loop = asyncio.get_event_loop() 30 | signal.signal(signal.SIGINT, raise_graceful_exit) 31 | signal.signal(signal.SIGTERM, raise_graceful_exit) 32 | background_tasks = set() 33 | task = loop.create_task(main()) 34 | background_tasks.add(task) 35 | try: 36 | # asyncio.run(main()) 37 | loop.run_until_complete(task) 38 | except GracefulExit: 39 | pass 40 | finally: 41 | loop.close() 42 | --------------------------------------------------------------------------------