├── .gitignore ├── README.md ├── UNLICENSE.md ├── docs ├── UNLICENSE.md ├── card_format.md ├── configs.md ├── creating_embeddings.md ├── csv.md ├── examples.md ├── getting_started.md ├── img │ ├── settings_panel.png │ ├── skynet01.png │ ├── skynet02.png │ ├── skynet03.png │ └── skynet04.png ├── index.md ├── named_entity_recognition.md ├── preparing_the_env.md ├── prompt_support.md ├── running_the_chatbot.md ├── running_the_env.md └── webscraping.md ├── mkdocs.yml ├── pyproject.toml ├── ruff_defaults.toml └── src ├── __about__.py ├── __init__.py └── llama_cpp_chat_memory ├── .chainlit └── config_example.toml ├── .env.example ├── __about__.py ├── character_chat.py ├── conversation_manager.py ├── custom_llm_classes ├── __init__.py └── custom_spacy_embeddings.py ├── developer ├── test_embeddings.py └── test_llm.py ├── document_analysis ├── collection_helper.py ├── ctfidf.py ├── general_word_frequency.py ├── generate_uuid.py ├── parse_ner.py └── spacy_explain.py ├── document_parsing ├── __init__.py ├── extract │ ├── __init__.py │ └── basics.py ├── filter_csv.py ├── parse_csv_to_text.py ├── parse_json_documents.py ├── parse_pdf_documents.py ├── parse_text_documents.py ├── parse_text_documents_old.py ├── parse_text_documents_simple.py ├── spacier │ ├── __init__.py │ ├── core.py │ └── utils.py ├── test_query.py ├── utils │ ├── __init__.py │ ├── cache.py │ ├── constants.py │ ├── errors.py │ ├── types.py │ └── utils.py └── web_scraper.py ├── flask_web_server.py ├── run_chat.py ├── run_files ├── cards │ ├── Shodan_v2.png │ └── Skynet_v2.png ├── documents │ ├── csv_test │ │ └── customers-100.csv │ ├── shodan │ │ └── shodan.txt │ ├── shodan_mes │ │ └── shodan_message_examples.txt │ └── skynet │ │ ├── skynet.pdf │ │ └── skynet.txt ├── filters │ ├── csv_filter.json │ └── web_scrape_filter.json ├── models │ └── models.txt ├── parse_configs │ ├── csv_columns.json │ ├── ner_types.json │ ├── ner_types_analyze.json │ ├── ner_types_full.json │ └── query_metadata_filter.json ├── prompt_templates │ ├── conversation_template.json │ ├── conversation_template2.json │ └── question_refining_metadata_template.json ├── run_settings │ └── run_config.json ├── web_scrape_configs │ ├── shodan.json │ ├── skynet.json │ └── warhammer_40k.json └── web_scrape_configs_old │ ├── skynet.json │ └── warhammer_40k.json ├── static └── style.css └── terminal_chatbot.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .env 3 | .env_test* 4 | dist/ 5 | site/ 6 | __pycache__/ 7 | src/llama_cpp_chat_memory/public/avatars/ 8 | src/llama_cpp_chat_memory/test/ 9 | src/llama_cpp_chat_memory/run_files/chroma_storage/ 10 | src/llama_cpp_chat_memory/run_files/character_storage/ 11 | src/llama_cpp_chat_memory/run_files/key_storage/ 12 | src/llama_cpp_chat_memory/.chainlit/.langchain.db 13 | src/llama_cpp_chat_memory/.chainlit/config.toml 14 | src/llama_cpp_chat_memory/.chainlit/translations/ 15 | src/llama_cpp_chat_memory/run_files/temp/ 16 | src/llama_cpp_chat_memory/test.py 17 | src/llama_cpp_chat_memory/document_parsing/test.py 18 | src/llama_cpp_chat_memory/chainlit.md 19 | src/llama_cpp_chat_memory/logs/ 20 | src/llama_cpp_chat_memory/.files/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llama-cpp-chat-memory 2 | This project is a llama-cpp character AI chatbot using tavern or V2 character cards and ChromaDB for character memory. 
You can also use it as just a normal character Ai chatbot. For full documentation visit [Chatbot Documentation](http://ossirytk.github.io/llama-cpp-chat-memory/index.html). 3 | 4 | ## TODO add latest updates here -------------------------------------------------------------------------------- /UNLICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /docs/UNLICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /docs/card_format.md: -------------------------------------------------------------------------------- 1 | ### Card Format 2 | See [character editor](https://character-tools.srjuggernaut.dev/character-editor).
3 | There are a few example cards included, such as 'Skynet' and 'Shodan'.
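To make the field list below concrete, here is a rough sketch of what a card could look like in json form. This is an illustration only: the values are placeholders, and real Tavern and V2 cards may nest these fields differently (V2 cards typically wrap them in a data block), so use the character editor linked above to produce actual cards.
```json
{
    "name": "Skynet",
    "description": "A military AI. Cold, calculating, speaks in short precise sentences.",
    "scenario": "Skynet is answering questions from a human operator in a terminal session.",
    "mes_example": "Example exchanges that show the tone and answer style you want the character to copy.",
    "first_mes": "Connection established. State your query."
}
```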
4 | 'name' : 'char_name'
5 | The name for the AI character. When using json or yaml, this is expected to correspond to the avatar image, name.png or name.jpg.
6 | 'description' : 'char_persona'
7 | The description for the character personality. Likes, dislikes, personality traits.
8 | 'scenario' : 'world_scenario'
9 | Description of the scenario. This roughly corresponds to instructions like "You are an HR customer service agent having a discussion with a customer. Always be polite."
10 | 'mes_example' : 'example_dialogue'
11 | Example dialogue. The AI will pick up answer patterns based on this.
12 | 'first_mes' : 'char_greeting'
13 | A landing page for the chat. This will not be included in the prompt. 14 | 15 | The documents folder includes some documents for embeddings parsing for the character cards. -------------------------------------------------------------------------------- /docs/configs.md: -------------------------------------------------------------------------------- 1 | ### Basic Configs 2 | You can change the configuration settings in .env file. 3 | 4 | The available embeddings are llama,spacy and hugginface. Make sure that the config for the chat matches the embeddings that were used to create the chroma collection. 5 | 6 | VECTOR_K is the value for vector storage documents for how many documents should be returned. You might need to change this based on your context and vector store chunk size. BUFFER_K is the size for conversation buffer. The prompt will include last K qustion answer pairs. Having large VECTOR_K and BUFFER_K can overfill the prompt. The default character card is Skynet_V2.png. This is just a basic template. 7 | 8 | Config Field | Description 9 | ------------- | ------------- 10 | MODEL_DIR | The dir for the models 11 | MODEL | model_name.gguf 12 | MODEL_TYPE | alpaca/mistral 13 | CHARACTER_CARD_DIR | The directory for chracter cards 14 | CHARACTER_CARD | character_card.png/yaml/json 15 | PERSIST_DIRECTORY | dir for chroma embeddings 16 | PROMPT_TEMPLATE_DIRECTORY | Prompt template are stored here 17 | REPLACE_YOU | Replace references to "You" in card with "User" 18 | KEY_STORAGE_DIRECTORY | dir for NER keys for chroma 19 | COLLECTION_CONFIG | Path to run config file for collection and prompt 20 | EMBEDDINGS_TYPE | llama/spacy/hugginface 21 | EMBEDDINGS_MODEL | spacy/hugginface model name (needs to be installed) 22 | CUSTOM_CSS | Url to the custom css file to be used by the application. 23 | REFINE_MODEL | Spacy model used for metadata ner parsing 24 | REFINE_CONFIG | Ner config file used for metadata ner parsing 25 | VECTOR_SORT_TYPE | Vector searach sort type distance/bm25/fusionrank 26 | VECTOR_K | Fetch k closest embeddings for mmr 27 | BUFFER_K | Buffer last k exchanges to conversation context 28 | FETCH_K | Fetch k closest embeddings for similiarity 29 | LAMBDA_MULT | Lambda for Chroma 30 | LAYERS | Number of layers to offload to gpu 31 | SEED | Seed used for generation. Default random (-1) 32 | N_PARTS | How many parts the model is divided into. Default auto (-1) 33 | USE_MLOCK | Load the whole model into ram. Default False 34 | TEMPERATURE | Adjust the randomness of the generated text (default: 0.8) 35 | TOP_P | A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9. 36 | REPEAT_PENALTY | The repeat-penalty option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1. 37 | TOP_K | A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40. 38 | LAST_N_TOKENS_SIZE | Last n tokens to consider for penalizing repetition 39 | USE_MMAP | Allows only the necessary parts to be loaded into memory and offloading the rest. Default false 40 | VERBOSE | Verbose mode. 
Default True 41 | ROPE_CONTEXT | Rope context for rope scaling 42 | N_CTX | Context size default 8192 43 | N_BATCH | Message write batch size 44 | MAX_TOKENS | Max tokens. Default 4096 45 | 46 | ### General Configs 47 | Other configs are found in the run_files folder. These include Webscrape configs, ner parse configs and filter configs. 48 | 49 | Filters folder defines the general webscrape filters to clean the documents. This file uses regex and can easily be modified to add extra filtering. 50 | 51 | Parse_configs defines the expected csv column structure and ner type parsing. This includes noun engrams, entities, noun chunks and parse type. 52 | 53 | Web scrape configs define the web pages fo a scrape. This is convinient if you want to scrape multiple pages. 54 | 55 | ### Run Config 56 | The run config in run_config.json in the run_files folder defines the options for chat run settings. The run config sets the defaults for message collection, context collection and for the prompt template. The config also gives the list of alternative collection and prompt settings. These can be changed while the chat is running from the chat settings menu. 57 | ![Settings Panel](img/settings_panel.png) 58 | -------------------------------------------------------------------------------- /docs/creating_embeddings.md: -------------------------------------------------------------------------------- 1 | ### Creating embeddings 2 | The embeddings creation uses env setting for threading and cuda. The Example documents are in the Documents folder. The scripts are in the documents_parsing folder. 3 | Use --help for basic instructions.
4 | The parsing script will parse all txt, pdf or json files in the target directory. For json lorebooks a key_storage file will also be created for metadata filtering.
5 | You need to download models for NER parsing. Textacy parses text files with Spacy sentence transformers to automatically generate keys for metadata filters. The default model is en_core_web_lg. See available models at [Spacy Models](https://spacy.io/usage/models)
6 | ``` 7 | python -m spacy download en_core_web_sm 8 | python -m spacy download en_core_web_md 9 | python -m spacy download en_core_web_lg 10 | ``` 11 | 12 | You might want to experiment with the chunk size and overlap based on your text documents.
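If you want to see what a given chunk size and overlap actually produce before building a collection, you can split a document yourself. The snippet below is only a rough sketch using langchain's RecursiveCharacterTextSplitter with the defaults listed further down (chunk size 1024, overlap 0); the parsing scripts may configure their splitting differently.
```python
from pathlib import Path

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Point this at one of your own text files; the skynet example document is used here.
text = Path("./run_files/documents/skynet/skynet.txt").read_text(encoding="utf-8")

splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
chunks = splitter.split_text(text)

print(f"{len(chunks)} chunks")
print(chunks[0][:200])  # peek at the start of the first chunk
```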
13 | The example documents include txt files for 'Skynet' and 'Shodan'.
14 | The supported lorebook formats are chub inferred AgnAIstic and SillyTavern original source. 15 | For pdf files there is a pdf file of short stories from Fyodor Dostoyevsky included The source is Internet Archive, the copy is in public domain. The pdf text quality is quite poor thought, so I recommend getting another file. 16 | 17 | Performance for files over 200mb is not great. Parsing a large text with a large keyfile will result in poor performance. It's more effective to have smaller colletions that have their own keyfiles rather that one large collection with one keyfile. I recommend splitting sollections my subject category and then switching as needed. 18 | 19 | **!!Important!!.** You need to make sure that the documents, character_storage and key_storage folders exist. 20 | 21 | Textacy parsing will use NER to parse keys from the document using sentence transformers. This keys can be used as Chroma metadata, 22 | NOTE: Textacy parsing will create a key file in key_storage that can be used by text parsing. Json files will create keys automatically if present in json file. 23 | ``` 24 | python -m document_parsing.textacy_parsing --collection-name skynet --embeddings-type spacy 25 | ``` 26 | 27 | Parse csv to text 28 | ``` 29 | python -m document_parsing.parse_csv_to_text 30 | ``` 31 | 32 | Parse the documents with. The new document parsing uses multiprocess to parse metadata keys created with parse_ner script. This increases the processing speed with large key files by a significant margin. The old script uses a single thread for processing keys and this can cause significant slowdown with many documents with large keyfiles. You can give the number of threads for the multiprocess with --threads 33 | ``` 34 | python -m document_parsing.parse_text_documents 35 | python -m document_parsing.parse_text_documents 36 | python -m document_parsing.parse_json_documents 37 | ``` 38 | 39 | You can test the embeddings with 40 | ``` 41 | python -m document_parsing.test_embeddings --collection-name skynet --query "Who is John Connor" --embeddings-type llama 42 | python -m document_parsing.test_embeddings --collection-name skynet2 --query "Who is John Connor" --embeddings-type spacy 43 | python -m document_parsing.test_embeddings --collection-name hogwarts --query "Who is Charles Rookwood'" --embeddings-type spacy 44 | ``` 45 | 46 | Optional params | Description 47 | ---------------------- | ------------- 48 | --documents-directory | The directory where your text files are stored. Default "./run_files/documents/skynet" 49 | --collection-name | The name of the collection. Default "skynet" 50 | --persist-directory | The directory where you want to store the Chroma collection. Default "./run_files/character_storage/" 51 | --key-storage | The directory for the collection metadata keys Need to be created with textacy parsing. Default "./run_files/key_storage/" 52 | --keyfile-name | The name of the keyfile. Matches collection name by default. 53 | --chunk-size | The text chunk size for parsing. Default "1024" 54 | --chunk-overlap | The overlap for text chunks for parsing. Default "0" 55 | --embeddings-type | The chosen embeddings type. Default "spacy" -------------------------------------------------------------------------------- /docs/csv.md: -------------------------------------------------------------------------------- 1 | ### Named Entity Recognition(NER) 2 | You can use filter_csv.py and parse_csv_to_text.py to process csv files. 
The filter script will remove rows using whitelists and blacklists. You can set a filter for any column. This is useful when you want to split large documents to more manageable portions. The csv to text parsing document filters web elements if you have webscraped data into csv. -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | ### Some examples 2 | ![skynet01](img/skynet01.png) 3 | ![skynet02](img/skynet02.png) 4 | ![skynet03](img/skynet03.png) 5 | ![skynet04](img/skynet04.png) -------------------------------------------------------------------------------- /docs/getting_started.md: -------------------------------------------------------------------------------- 1 | You will need hatch to run this project. You can install hatch with pipx. See [Hatch](https://pypi.org/project/hatch/) and [Pipx](https://pipx.pypa.io/latest/installation/). The commands here are for windows powershell. If you use another shell, you'll have to change things as needed. 2 | ``` 3 | pip install pipx 4 | pipx install hatch 5 | ``` 6 | Then from the repo root folder run. 7 | ``` 8 | hatch shell chat 9 | cd .\src\llama_cpp_chat_memory\ 10 | python -m spacy download en_core_web_lg 11 | playwright install 12 | ``` 13 | 14 | You will need spacy models for text embeddings if you do not use llama-cpp embeddings. Playwright is used by the old webscrape scripts. These are not needed for running the chatbot itself.
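If you want to confirm that the spacy model downloaded correctly, a quick sanity check from a Python shell looks roughly like this (assuming en_core_web_lg, the default model used elsewhere in these docs):
```python
import spacy

# Load the model and embed a short test sentence.
nlp = spacy.load("en_core_web_lg")
doc = nlp("Skynet is watching.")
print(doc.vector.shape)  # a non-empty vector means the model loaded with word vectors
```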
15 | 16 | You also might want to run llama-cpp with GPU acceleration such as CUDA. See [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) for specifics. Then run: 17 | ``` 18 | $env:FORCE_CMAKE=1 19 | $env:CMAKE_ARGS="-DLLAMA_CUBLAS=on" 20 | pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --no-deps 21 | ``` 22 | 23 | Note that this example is for PowerShell and for the latest llama-cpp-python. You will need to change the command based on your terminal and the llama-cpp-python version.
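After reinstalling you can check that offloading works by loading a model directly with the llama-cpp-python bindings. This is only a sketch: the model path is a placeholder and the right n_gpu_layers value depends on your GPU memory (it corresponds to the LAYERS setting described in the configs).
```python
from llama_cpp import Llama

# Placeholder path; point this at your own gguf model.
llm = Llama(model_path="./models/your-model.gguf", n_gpu_layers=10, verbose=True)

# With a working CUDA build, the verbose startup log should mention your GPU
# and how many layers were offloaded to it.
print(llm("Hello", max_tokens=16)["choices"][0]["text"])
```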
24 | 25 | Get a gguf model from a site like 26 | [The Bloke](https://huggingface.co/models?sort=modified&search=theBloke+gguf) 27 | and a character card and lorebooks from a site like [Character hub](https://www.characterhub.org/) or make your own with [character editor](https://character-tools.srjuggernaut.dev/character-editor)
28 | 29 | Rename the .env_test file to .env and make sure that the correct folders exist.
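The folders below are the defaults referenced by the example .env and the rest of these docs (Chroma collections, metadata keys and card avatars). This is only a convenience sketch; adjust the paths if you changed them in your .env, and run it from the src/llama_cpp_chat_memory directory.
```python
from pathlib import Path

# Default storage folders used by the docs and the example .env; adjust as needed.
for folder in [
    "run_files/character_storage",
    "run_files/key_storage",
    "public/avatars",
]:
    Path(folder).mkdir(parents=True, exist_ok=True)
```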
30 | 31 | You can set the collection to "" and try the chatbot by running: 32 | ``` 33 | chainlit run character_chat.py 34 | ``` 35 | If you want to create memory then see more details below. -------------------------------------------------------------------------------- /docs/img/settings_panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/settings_panel.png -------------------------------------------------------------------------------- /docs/img/skynet01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet01.png -------------------------------------------------------------------------------- /docs/img/skynet02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet02.png -------------------------------------------------------------------------------- /docs/img/skynet03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet03.png -------------------------------------------------------------------------------- /docs/img/skynet04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/docs/img/skynet04.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # llama-cpp-chat-memory 2 | This project is intended as an example and a basic framework for a locally run chatbot with documents. The target user group is developers with some understanding about python and llm framworks. If you want to learn about llm and AI, when you can take a look at my [llm resources for beginners](https://github.com/ossirytk/llm_resources) or [PygWiki](https://wikia.schneedc.com/). This project is mainly intended to serve as a more fleshed out tutorial and a basic frame to test various things like document embeddings. For this reason, the chatbot itself is intended to be lightweight and simple. You can also use this chatbot to test models and prompts. The document fetching can be disabled by setting collection to "" in the config files. This leaves you with just a basic character chatbot.
3 | 4 | Everything is designed to run locally. The model is run with llama.cpp and its Python bindings, the UI is Chainlit, the vector database is Chroma, and everything is glued together with Langchain. Document processing uses Spacy, Sentence Transformers and Playwright. There are no dependencies on external APIs. Llama.cpp can use GPU acceleration with CUDA and BLAS. See [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) for documentation.
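As a very rough illustration of how these pieces connect (this is not the project's actual wiring, which lives in conversation_manager.py; the paths, collection name and query are placeholders, and the snippet assumes it is run from the src/llama_cpp_chat_memory folder so the project's custom embeddings class can be imported):
```python
from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_community.vectorstores import Chroma

# Embeddings and vector store hold the character memory.
embeddings = CustomSpacyEmbeddings(model_path="en_core_web_lg")
vector_store = Chroma(
    collection_name="skynet",
    embedding_function=embeddings,
    persist_directory="./run_files/character_storage/",
)
context = vector_store.similarity_search("Who is John Connor?", k=1)

# The local model answers with the retrieved context pasted into the prompt.
llm = LlamaCpp(model_path="./models/your-model.gguf", n_ctx=8192)
print(llm.invoke(f"Context: {context}\nQuestion: Who is John Connor?"))
```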
5 | 6 | The chatbot uses character cards as prompts. The supported cards are Tavern and V2. Internal lorebooks are not supported yet. There are several scripts for parsing json lorebooks, pdt, textfiles and scarping web pages for the memory content. Also included are scripts for parsing metadata from documents automatically. -------------------------------------------------------------------------------- /docs/named_entity_recognition.md: -------------------------------------------------------------------------------- 1 | ### Named Entity Recognition(NER) 2 | You can use textacy_parsing script for generating document metadata keys automatically. The scripts are a modified version of textacy code updated to run with the current spacy version. The script uses a spacy embeddings model to process a text document for a json metadata keyfile. The keys are parsed based on a config file in run_files/parse_configs/ner_types.json or run_files/parse_configs/ner_types_full.json. You can give your own config file if you want. 3 | 4 | **The new parse script uses multiprocess to improve performance. The default process pool number is 6. You should change the process number based on the number of cores your machine has.** 5 | 6 | The available configs are 7 | 8 | Ngrams | Description 9 | ------------- | ------------- 10 | PROPN | Proper Noun 11 | NOUN | Noun 12 | ADJ | Adjective 13 | NNP | Noun proper singular 14 | NN | Noun, singular or mass 15 | AUX | Auxiliary 16 | VBZ | Verb, 3rd person singular present 17 | VERB | Verb 18 | ADP | Adposition 19 | SYM | Symbol 20 | NUM | Numeral 21 | CD | Cardinal number 22 | VBG | verb, gerund or present participle 23 | ROOT | Root 24 | 25 | Entities | Description 26 | ------------- | ------------- 27 | FAC | Buildings, airports, highways, bridges, etc. 28 | NORP | Nationalities or religious or political groups 29 | GPE | Countries, cities, states 30 | PRODUCT | Objects, vehicles, foods, etc. (not services) 31 | EVENT | Named hurricanes, battles, wars, sports events, etc. 32 | PERSON | People, including fictional 33 | ORG | Companies, agencies, institutions, etc. 34 | LOC | Non-GPE locations, mountain ranges, bodies of water 35 | DATE | Absolute or relative dates or periods 36 | TIME | Times smaller than a day 37 | WORK_OF_ART | Titles of books, songs, etc. 38 | 39 | Extract type | Description 40 | --------------- | ------------- 41 | orth | Terms are represented by their text exactly as written 42 | lower | Lowercased form of the text 43 | lemma | Base form w/o inflectional suffixes 44 | 45 | For details see [Spacy linguistic features](https://spacy.io/usage/linguistic-features) and [Model NER labels](https://spacy.io/models/en). The instructions expect en model, but spacy supports a wide range of models. You can also specify Noun chunks. Noun chunk of 2 for example would create keys like "Yellow House" or "Blond Hair". 46 | 47 | 48 | 49 | You can create ner metadata list with 50 | ``` 51 | python -m document_parsing.parse_ner 52 | ``` 53 | 54 | Optional param | Description 55 | ---------------------- | ------------- 56 | --data-directory | The directory where your text files are stored. Default "./run_files/documents/skynet" 57 | --collection-name | The name of the collection Will be used as name and location for the keyfile. Default "skynet" 58 | --key-storage | The directory for the collection metadata keys. Default "./run_files/key_storage/" 59 | --threads | The number of multiprocess threads. Default 6. 
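For reference, a parse config is a small json file whose top-level keys match what the parsing code reads: "ngs" (ngram part-of-speech tags), "entities" (entity labels), "noun_chunks" (chunk size) and "extract_type". The sketch below is only an illustration assembled from the tables above; check run_files/parse_configs/ner_types.json for the actual defaults.
```json
{
    "ngs": ["PROPN", "NOUN", "ADJ"],
    "entities": ["PERSON", "ORG", "GPE", "LOC", "EVENT"],
    "noun_chunks": 2,
    "extract_type": "lemma"
}
```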
-------------------------------------------------------------------------------- /docs/preparing_the_env.md: -------------------------------------------------------------------------------- 1 | ### Preparing the env 2 | You will need a llama model that is compatible with llama-cpp. See models in HuggingFace by [The Bloke](https://huggingface.co/models?sort=modified&search=theBloke+gguf)
3 | You might want to build with CUDA support.
4 | You need to pass FORCE_CMAKE=1 and CMAKE_ARGS="-DLLAMA_CUBLAS=on" as environment variables. This is the PowerShell syntax; use whatever syntax your shell uses to set environment variables.
5 | You need to download language models if you use NER parsing, embeddings or spacy sentence transformers. The default model is en_core_web_lg. See available models at [Spacy Models](https://spacy.io/usage/models)
6 | Choose the preferred model size and type. 7 | ``` 8 | python -m spacy download en_core_web_sm 9 | python -m spacy download en_core_web_md 10 | python -m spacy download en_core_web_lg 11 | ``` 12 | 13 | For installing dependencies in the virtual envs with hatch 14 | ``` 15 | hatch env create 16 | ``` 17 | Copy the .env_test to .env and set directories and model settings 18 | NOTE: Setting collection to "" will disable chroma fetching and you will get a normal character chatbot. -------------------------------------------------------------------------------- /docs/prompt_support.md: -------------------------------------------------------------------------------- 1 | ### Prompt Support 2 | Supports alpaca and mistral text prompts, V2 and tavern style json and yaml files and V2 and tavern png cards. Avatar images need to be in the same folder as the prompt file. V2 and Tavern png files get a copy of the image without exif data in the project temp file. Inbuilt lorebooks are currently not supported 3 | 4 | See [Character hub](https://www.characterhub.org/) for some character cards or make your own with [character editor](https://character-tools.srjuggernaut.dev/character-editor).
-------------------------------------------------------------------------------- /docs/running_the_chatbot.md: -------------------------------------------------------------------------------- 1 | ### Running the chatbot 2 | To run the chatbot. You need to run the chat with the custom script instead of the chainlit run command. 3 | The reason for this is the updates for the config files when switching character. 4 | These changes need to be done before calling chainlit. 5 | 6 | If you call chainlit directly, the character name and avatar picture won't update. 7 | 8 | Note: Currently something seems to be cached by chainlit. Until I find a way to clear the cache, 9 | you need to call run_chat twice for changes to take effect. 10 | 11 | Some browsers don't allow loading css file from local directories. For testing purposes there is a flask script to run a simple http server that serves stylesheets from the "static/" directory. You will need to run the flask server in another terminal instance. 12 | 13 | ``` 14 | cd src\llama_cpp_langchain_chat 15 | python -m run_chat 16 | ``` 17 | 18 | The chatbot should open in your browser
19 | 20 | Running flask 21 | ``` 22 | hatch shell chat 23 | cd .\src\llama_cpp_chat_memory\ 24 | flask --app flask_web_server run 25 | ``` 26 | ### Running the terminal chatbot 27 | You can run the chatbot directly in the terminal without starting a web browser. The terminal script is a low effort way to debug the chatbot quickly. 28 | 29 | ``` 30 | cd src\llama_cpp_langchain_chat 31 | python -m document_parsing.terminal_chatbot 32 | ``` 33 | ### Avatar Images 34 | Avatar images need to be stored in the .\public\avatars folder. Make sure that the folder exists. Character cards in png format will have a copy of the image data saved in the avatars folder automatically. If you copy an image manually, make sure that the filename matches the name is the character card and replace the whitespace in the name with underscores. 35 | ### Vector search 36 | The search for relevant documents from chroma happens based on VECTOR_SORT_TYPE and VECTOR_K. The search will return VECTOR_K+4 closest matches and sorts by sort type before appending to vector_k. Default search simply sorts by distance. "bm25" sorts with the bm25 search algorithm. Fusion rank gets the combined results of both. 37 | ### Query metadata 38 | The query is parsed for metadata using spacy. The metadata keys are used as a filter when searching the Chroma collections. -------------------------------------------------------------------------------- /docs/running_the_env.md: -------------------------------------------------------------------------------- 1 | ### Running the env 2 | You'll need to run all the commands inside the virtual env. Some browsers don't allow loading css file from local directories. For testing purposes there is a flask script to run a simple http server that serves stylesheets from the "static/" directory. You will need to run the flask server in another terminal instance. 3 | ``` 4 | hatch shell chat 5 | (optional for cuda support)$env:FORCE_CMAKE=1 6 | (optional for cuda support)$env:CMAKE_ARGS="-DLLAMA_CUBLAS=on" 7 | (optional for cuda support)pip install llama-cpp-python==VERSION --force-reinstall --upgrade --no-cache-dir --no-deps 8 | cd src\llama_cpp_langchain_chat 9 | ``` 10 | 11 | Running flask 12 | ``` 13 | hatch shell chat 14 | cd .\src\llama_cpp_chat_memory\ 15 | flask --app flask_web_server run 16 | ``` -------------------------------------------------------------------------------- /docs/webscraping.md: -------------------------------------------------------------------------------- 1 | ### Webscraping 2 | You can scrape web pages to text documents in order to use them as documents for chroma. 3 | 4 | Optional. The old web scraping uses playwright and requires that the web engines are installed. After starting the virtual env run:
5 | 6 | ``` 7 | playwright install 8 | ``` 9 | 10 | The web scraping is prepared with config files in the web_scrape_configs folder. The format is json. See the example files for the specifics. A number of regex filters are used to clean the scrape data. You can modify and add filters if you want. The filters are stored in the src/llama_cpp_chat_memory/run_files/filters/web_scrape_filters.json file.
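As a rough illustration of what the scrape-and-clean step boils down to (this is not the web_scraper script itself, just a sketch using trafilatura, which is one of the project's dependencies, plus a regex filter of the kind the filter file holds; the URL, pattern and output path are placeholders):
```python
import re

import trafilatura

# Placeholder URL; the real pages come from the web scrape config file.
downloaded = trafilatura.fetch_url("https://example.com/some-wiki-page")
text = trafilatura.extract(downloaded) or ""

# The real cleanup patterns live in the json filter file; this shows the general idea.
cleaned = re.sub(r"\[edit\]", "", text)

with open("./run_files/documents/skynet/scraped.txt", "w", encoding="utf-8") as f:
    f.write(cleaned)
```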
11 | 12 | To run the scrape, run: 13 | ``` 14 | python -m document_parsing.web_scraper
15 | ``` 16 | 17 | Optional param | Description 18 | ---------------------- | ------------- 19 | --data-directory | The directory where your text files are stored. Default "./run_files/documents/skynet" 20 | --collection-name | The name of the collection. Default "skynet" 21 | --web-scrape-directory | The config file to be used for the webscrape. Default "./run_files/web_scrape_configs/" -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Llama.cpp chat 2 | site_url: https://ossirytk.github.io/llama-cpp-chat-memory 3 | repo_url: https://github.com/ossirytk/llama-cpp-chat-memory 4 | nav: 5 | - Home: index.md 6 | - Quickstart: 7 | - Getting started: getting_started.md 8 | - Prompt Support: prompt_support.md 9 | - Card Format: card_format.md 10 | - Configs: configs.md 11 | - Preparing the env: preparing_the_env.md 12 | - The chatbot: 13 | - Running the env: running_the_env.md 14 | - Running the chatbot: running_the_chatbot.md 15 | - Working with documents: 16 | - Webscraping: webscraping.md 17 | - Csv filtering and parsing: csv.md 18 | - Named Entity Recognition(NER): named_entity_recognition.md 19 | - Creating embeddings: creating_embeddings.md 20 | - Extras: 21 | - Some Examples: examples.md 22 | - License: UNLICENSE.md 23 | theme: readthedocs 24 | markdown_extensions: 25 | - fenced_code 26 | - tables 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "llama-cpp-chat-memory" 7 | dynamic = ["version"] 8 | description = 'llama_cpp chat with langhcain, chainlit and vectorstore memory.' 
9 | readme = "README.md" 10 | license = "UNLICENSE" 11 | keywords = [] 12 | authors = [ 13 | { name = "ossirytk", email = "ossirytk@gmail.com" }, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 4 - Beta", 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3.11", 19 | ] 20 | dependencies = [ 21 | "langchain==0.2.16", 22 | "langchain-community==0.2.16", 23 | "chainlit==1.1.404", 24 | "llama-cpp-python==0.2.90", 25 | "Pillow==10.4.0", 26 | "PyYAML==6.0.2", 27 | "toml==0.10.2", 28 | "chromadb==0.5.5", 29 | "pypdf==4.3.1", 30 | "sentence-transformers==3.0.1", 31 | "simsimd==5.1.0", 32 | "pydantic==2.9.0", 33 | "cytoolz==0.12.3", 34 | "spacy==3.7.6", 35 | "pandas==2.2.2", 36 | "pyarrow==17.0.0", 37 | "trafilatura==1.12.1", 38 | "flask==3.0.3", 39 | "nltk==3.9.1", 40 | "rank-bm25==0.2.2", 41 | "click==8.1.7", 42 | "scikit-learn==1.5.2", 43 | ] 44 | 45 | [project.urls] 46 | Documentation = "https://github.com/ossirytk/llama-cpp-chat-memory/blob/main/README.md" 47 | Issues = "https://github.com/ossirytk/llama-cpp-chat-memory/issues" 48 | Source = "https://github.com/ossirytk/llama-cpp-chat-memory" 49 | 50 | [tool.hatch.version] 51 | path = "src/llama_cpp_chat_memory/__about__.py" 52 | 53 | [tool.hatch.envs.chat] 54 | decription="Llama cpp chat with vector store memory" 55 | dependencies = [ 56 | "coverage[toml]>=6.5", 57 | "pytest", 58 | ] 59 | [tool.hatch.envs.chat.scripts] 60 | test = "pytest {args:tests}" 61 | test-cov = "coverage run -m pytest {args:tests}" 62 | cov-report = [ 63 | "- coverage combine", 64 | "coverage report", 65 | ] 66 | cov = [ 67 | "test-cov", 68 | "cov-report", 69 | ] 70 | 71 | [[tool.hatch.envs.all.matrix]] 72 | python = ["3.11"] 73 | 74 | [tool.hatch.envs.lint] 75 | detached = true 76 | dependencies = [ 77 | "black>=23.1.0", 78 | "mypy>=1.0.0", 79 | "ruff>=0.0.243", 80 | ] 81 | 82 | [tool.hatch.build.targets.wheel] 83 | packages = ["src/llama_cpp_chat_memory"] 84 | 85 | [tool.hatch.envs.lint.scripts] 86 | typing = "mypy --install-types --non-interactive {args:src/llama_cpp_chat_memory tests}" 87 | style = [ 88 | "ruff {args:.}", 89 | "black --check --diff {args:.}", 90 | ] 91 | 92 | fmt = [ 93 | "black {args:.}", 94 | "ruff --fix {args:.}", 95 | "style", 96 | ] 97 | all = [ 98 | "style", 99 | "typing", 100 | ] 101 | 102 | [tool.black] 103 | target-version = ["py311"] 104 | line-length = 120 105 | skip-string-normalization = true 106 | 107 | [tool.ruff] 108 | target-version = "py311" 109 | line-length = 120 110 | select = [ 111 | "A", 112 | "ARG", 113 | "B", 114 | "C", 115 | "DTZ", 116 | "E", 117 | "EM", 118 | "F", 119 | "FBT", 120 | "I", 121 | "ICN", 122 | "ISC", 123 | "N", 124 | "PLC", 125 | "PLE", 126 | "PLR", 127 | "PLW", 128 | "Q", 129 | "RUF", 130 | "S", 131 | "T", 132 | "TID", 133 | "UP", 134 | "W", 135 | "YTT", 136 | ] 137 | ignore = [ 138 | # Allow non-abstract empty methods in abstract base classes 139 | "B027", 140 | # Allow boolean positional values in function calls, like `dict.get(... 
True)` 141 | "FBT003", 142 | # Ignore checks for possible passwords 143 | "S105", "S106", "S107", 144 | # Ignore complexity 145 | "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", 146 | ] 147 | unfixable = [ 148 | # Don't touch unused imports 149 | "F401", 150 | ] 151 | 152 | [tool.ruff.isort] 153 | known-first-party = ["llama_cpp_chat_memory"] 154 | 155 | [tool.ruff.flake8-tidy-imports] 156 | ban-relative-imports = "all" 157 | 158 | [tool.ruff.per-file-ignores] 159 | # Tests can use magic values, assertions, and relative imports 160 | "tests/**/*" = ["PLR2004", "S101", "TID252"] 161 | 162 | [tool.coverage.run] 163 | source_pkgs = ["llama_cpp_chat_memory", "tests"] 164 | branch = true 165 | parallel = true 166 | omit = [ 167 | "src/llama_cpp_chat_memory/__about__.py", 168 | ] 169 | 170 | [tool.coverage.paths] 171 | llama_cpp_chat_memory = ["src/llama_cpp_chat_memory", "*/llama-cpp-chat-memory/src/llama_cpp_chat_memory"] 172 | tests = ["tests", "*/llama-cpp-chat-memory/tests"] 173 | 174 | [tool.coverage.report] 175 | exclude_lines = [ 176 | "no cov", 177 | "if __name__ == .__main__.:", 178 | "if TYPE_CHECKING:", 179 | ] 180 | -------------------------------------------------------------------------------- /ruff_defaults.toml: -------------------------------------------------------------------------------- 1 | # Exclude a variety of commonly ignored directories. 2 | exclude = [ 3 | ".bzr", 4 | ".direnv", 5 | ".eggs", 6 | ".git", 7 | ".git-rewrite", 8 | ".hg", 9 | ".mypy_cache", 10 | ".nox", 11 | ".pants.d", 12 | ".pytype", 13 | ".ruff_cache", 14 | ".svn", 15 | ".tox", 16 | ".venv", 17 | "__pypackages__", 18 | "_build", 19 | "buck-out", 20 | "build", 21 | "dist", 22 | "node_modules", 23 | "venv", 24 | ] 25 | 26 | # Same as Black. 27 | line-length = 88 28 | indent-width = 4 29 | 30 | # Assume Python 3.11 31 | target-version = "py311" 32 | 33 | [lint] 34 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 35 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 36 | # McCabe complexity (`C901`) by default. 37 | select = ["E4", "E7", "E9", "F"] 38 | ignore = [] 39 | 40 | # Allow fix for all enabled rules (when `--fix`) is provided. 41 | fixable = ["ALL"] 42 | unfixable = [] 43 | 44 | # Allow unused variables when underscore-prefixed. 45 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 46 | 47 | [format] 48 | # Like Black, use double quotes for strings. 49 | quote-style = "double" 50 | 51 | # Like Black, indent with spaces, rather than tabs. 52 | indent-style = "space" 53 | 54 | # Like Black, respect magic trailing commas. 55 | skip-magic-trailing-comma = false 56 | 57 | # Like Black, automatically detect the appropriate line ending. 58 | line-ending = "auto" 59 | 60 | # Enable auto-formatting of code examples in docstrings. Markdown, 61 | # reStructuredText code/literal blocks and doctests are all supported. 62 | # 63 | # This is currently disabled by default, but it is planned for this 64 | # to be opt-out in the future. 65 | docstring-code-format = false 66 | 67 | # Set the line length limit used when formatting code snippets in 68 | # docstrings. 69 | # 70 | # This only has an effect when the `docstring-code-format` setting is 71 | # enabled. 
72 | docstring-code-line-length = "dynamic" -------------------------------------------------------------------------------- /src/__about__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1" 2 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/.chainlit/config_example.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | enable_telemetry = false 3 | user_env = [] 4 | session_timeout = 50000 5 | cache = false 6 | allow_origins = [ "*",] 7 | 8 | [features] 9 | unsafe_allow_html = false 10 | latex = false 11 | auto_tag_thread = true 12 | edit_message = true 13 | 14 | [UI] 15 | name = "Assistant" 16 | default_collapse_content = true 17 | cot = "hidden" 18 | custom_css = "http://127.0.0.1:5000/static/style.css" 19 | 20 | [meta] 21 | generated_by = "1.1.404" 22 | 23 | [features.spontaneous_file_upload] 24 | enabled = true 25 | accept = [ "*/*",] 26 | max_files = 20 27 | max_size_mb = 500 28 | 29 | [features.audio] 30 | min_decibels = -45 31 | initial_silence_timeout = 3000 32 | silence_timeout = 1500 33 | max_duration = 15000 34 | chunk_duration = 1000 35 | sample_rate = 44100 36 | 37 | [UI.theme] 38 | default = "dark" 39 | layout = "wide" 40 | 41 | [UI.theme.light.primary] 42 | 43 | [UI.theme.light.text] 44 | 45 | [UI.theme.dark.primary] 46 | 47 | [UI.theme.dark.text] 48 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/.env.example: -------------------------------------------------------------------------------- 1 | MODEL_DIR = "./models/" 2 | MODEL = "llama2.gguf" 3 | MODEL_TYPE = "alpaca" 4 | CHARACTER_CARD_DIR = "./cards/" 5 | CHARACTER_CARD = "Skynet_v2.png" 6 | PERSIST_DIRECTORY = "./run_files/character_storage/" 7 | PROMPT_TEMPLATE_DIRECTORY="./prompt_templates/" 8 | REPLACE_YOU=False 9 | KEY_STORAGE_DIRECTORY = "./run_files/key_storage/" 10 | COLLECTION_CONFIG = "./run_files/run_settings/run_config.json" 11 | EMBEDDINGS_TYPE = "spacy" 12 | EMBEDDINGS_MODEL = "en_core_web_lg" 13 | CUSTOM_CSS="http://127.0.0.1:5000/static/style.css" 14 | REFINE_MODEL="en_core_web_lg" 15 | REFINE_CONFIG="./run_files/parse_configs/query_metadata_filter.json" 16 | VECTOR_SORT_TYPE="fusion_rank" 17 | VECTOR_K = 1 18 | BUFFER_K = 3 19 | FETCH_K = 10 20 | LAMBDA_MULT = 0.75 21 | LAYERS = 10 22 | SEED = -1 23 | N_PARTS = -1 24 | USE_MLOCK = False 25 | TEMPERATURE = 0.7 26 | TOP_P = 0.95 27 | REPEAT_PENALTY = 1.1 28 | TOP_K = 50 29 | LAST_N_TOKENS_SIZE = 256 30 | USE_MMAP = False 31 | VERBOSE = True 32 | ROPE_CONTEXT = 1 33 | N_CTX = 8192 34 | N_BATCH = 256 35 | MAX_TOKENS = 4096 -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/__about__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1" 2 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/character_chat.py: -------------------------------------------------------------------------------- 1 | import chainlit as cl 2 | from chainlit.input_widget import Select 3 | from 
conversation_manager import ConveresationManager 4 | 5 | conversation_manager = ConveresationManager() 6 | 7 | 8 | @cl.author_rename 9 | def rename(orig_author: str): 10 | # Renames chatbot to whatever the current character card name is 11 | rename_dict = {"Chatbot": conversation_manager.get_character_name()} 12 | return rename_dict.get(orig_author, orig_author) 13 | 14 | 15 | @cl.on_chat_start 16 | async def start(): 17 | await cl.ChatSettings( 18 | [ 19 | Select( 20 | id="prompt_template_options", 21 | label="Prompt Templates", 22 | values=conversation_manager.get_prompt_templates(), 23 | initial_index=conversation_manager.get_prompt_template_index(), 24 | ), 25 | Select( 26 | id="context_collection", 27 | label="Context Collection", 28 | values=conversation_manager.get_context_collections(), 29 | initial_index=conversation_manager.get_context_index(), 30 | ), 31 | Select( 32 | id="mex_collection", 33 | label="Mex. Collection", 34 | values=conversation_manager.get_mes_collections(), 35 | initial_index=conversation_manager.get_mes_index(), 36 | ), 37 | ] 38 | ).send() 39 | 40 | 41 | @cl.on_settings_update 42 | async def setup_agent(settings: dict[str, str]): 43 | conversation_manager.update_settings(settings) 44 | 45 | 46 | @cl.on_message 47 | async def main(message: cl.Message): 48 | 49 | result: cl.Message = await conversation_manager.ask_question(message) 50 | await result.send() 51 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/custom_llm_classes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/custom_llm_classes/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/custom_llm_classes/custom_spacy_embeddings.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | from typing import Any 3 | 4 | from langchain.pydantic_v1 import BaseModel, Extra, root_validator 5 | from langchain.schema.embeddings import Embeddings 6 | 7 | 8 | class CustomSpacyEmbeddings(BaseModel, Embeddings): 9 | model_path: str 10 | nlp: Any 11 | 12 | class Config: 13 | """Configuration for this pydantic object.""" 14 | 15 | extra = Extra.forbid 16 | 17 | @root_validator(pre=True) 18 | def validate_environment(cls, values: dict) -> dict: 19 | model_path = values["model_path"] 20 | # Check if the Spacy package is installed 21 | if importlib.util.find_spec("spacy") is None: 22 | spacy_not_installed_error_message = "Spacy package not found. Please install it with `pip install spacy`." 23 | raise ValueError(spacy_not_installed_error_message) 24 | try: 25 | # Try to load the 'en_core_web_sm' Spacy model 26 | import spacy 27 | 28 | values["nlp"] = spacy.load(model_path) 29 | except OSError: 30 | # If the model is not found, raise a ValueError 31 | error_message = f"""Spacy model not found. 32 | Please install it with 33 | python -m spacy download {model_path}""" 34 | raise ValueError(error_message) from None 35 | return values # Return the validated values 36 | 37 | def embed_documents(self, texts: list[str]) -> list[list[float]]: 38 | """ 39 | Generates embeddings for a list of documents. 40 | 41 | Args: 42 | texts (List[str]): The documents to generate embeddings for. 43 | 44 | Returns: 45 | A list of embeddings, one for each document. 
46 | """ 47 | return [self.nlp(text).vector.tolist() for text in texts] 48 | 49 | def embed_query(self, text: str) -> list[float]: 50 | """ 51 | Generates an embedding for a single piece of text. 52 | 53 | Args: 54 | text (str): The text to generate an embedding for. 55 | 56 | Returns: 57 | The embedding for the text. 58 | """ 59 | return self.nlp(text).vector.tolist() 60 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/developer/test_embeddings.py: -------------------------------------------------------------------------------- 1 | from os import makedirs 2 | 3 | import click 4 | import pandas as pd 5 | from conversation_manager import ConveresationManager 6 | from langchain_core.documents.base import Document 7 | 8 | 9 | @click.command() 10 | @click.argument("query") 11 | @click.option( 12 | "--ttype", 13 | "-t", 14 | default="mes", 15 | type=click.Choice(["mes", "context"]), 16 | help="Test type.", 17 | ) 18 | @click.option("--keywords", "-k", default="polito, cyborgs, shodan", help="Query metadata keywords.") 19 | @click.option("--ksize", default=3, help="The amount of context to fetch") 20 | def main(query: str, ttype, keywords: str, ksize: int) -> None: 21 | """ 22 | This script is for doing tests on embeddings. Retuns metadata results from the vector storage. 23 | """ 24 | test_path = "./test/" 25 | # test_file = "./test/test.json" 26 | makedirs(test_path, exist_ok=True) 27 | conversation_manager = ConveresationManager(test="Testing") 28 | 29 | metadata_filter = conversation_manager.get_metadata_filter(keywords, ttype) 30 | docs: list[tuple[Document, float]] = conversation_manager.get_vector(query, ttype, metadata_filter, ksize) 31 | 32 | df: pd.DataFrame = conversation_manager.calculate_fusion_rank(query, docs) 33 | 34 | # result = df.to_json(orient="split") 35 | # with open(test_file, "w") as w: 36 | # w.write(result) 37 | 38 | df = df.iloc[0:ksize] 39 | for line_item in df["content"].tolist(): 40 | print(line_item) 41 | print("-----------") 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/developer/test_llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import click 4 | from conversation_manager import ConveresationManager 5 | 6 | conversation_manager = ConveresationManager() 7 | 8 | 9 | async def test_llm(query: str): 10 | await conversation_manager.ask_question_test(query) 11 | 12 | 13 | @click.command() 14 | @click.argument("query") 15 | def main( 16 | query: str, 17 | ) -> None: 18 | """ 19 | This script is for doing quick tests on the model. Runs a single shot query. 
20 | """ 21 | asyncio.run(test_llm(query)) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/collection_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import chromadb 4 | import click 5 | from chromadb.api.client import Client 6 | from chromadb.config import Settings 7 | from dotenv import find_dotenv, load_dotenv 8 | 9 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 10 | 11 | load_dotenv(find_dotenv()) 12 | 13 | 14 | @click.command() 15 | @click.argument("command", type=click.Choice(["list", "delete"])) 16 | @click.option( 17 | "--collection-name", 18 | "-c", 19 | "collection_name", 20 | default="skynet", 21 | help="The name of the Chroma collection that's the target of an action", 22 | ) 23 | @click.option( 24 | "--persist-directory", 25 | "-p", 26 | "persist_directory", 27 | default="./run_files/character_storage/", 28 | help="The directory where you want to store the Chroma collection", 29 | ) 30 | def main( 31 | collection_name: str, 32 | persist_directory: str, 33 | command: str, 34 | ) -> None: 35 | client: Client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 36 | 37 | match command: 38 | case "list": 39 | logging.info("Available collections:") 40 | collections = client.list_collections() 41 | for collection in collections: 42 | logging.info(collection.name) 43 | case "delete": 44 | logging.info(f"Deleting {collection_name}") 45 | client.delete_collection(collection_name) 46 | logging.info(f"{collection_name} deleted") 47 | case _: 48 | collections = client.list_collections() 49 | logging.info("Available collections:") 50 | for collection in collections: 51 | logging.info(collection.name) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/ctfidf.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import re 7 | from collections.abc import Iterable 8 | from functools import partial 9 | from os.path import exists, join 10 | from pathlib import Path 11 | 12 | import click 13 | from document_parsing.extract import entities, ngrams, terms 14 | from document_parsing.extract.basics import terms_to_strings 15 | from document_parsing.spacier import core 16 | from dotenv import find_dotenv, load_dotenv 17 | from spacy.tokens import Doc 18 | 19 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 20 | 21 | load_dotenv(find_dotenv()) 22 | 23 | SPACY_CHARACTER_LIMIT = 1000000 24 | 25 | 26 | def split_text( 27 | text: str, 28 | chunk_size: int, 29 | chunk_overlap: int, 30 | ) -> list[str]: 31 | separators = ["\n\n", "\n", " ", ""] 32 | 33 | """Split incoming text and return chunks.""" 34 | final_chunks = [] 35 | # Get appropriate separator to use 36 | separator = separators[-1] 37 | new_separators = [] 38 | for i, _s in enumerate(separators): 39 | _separator = re.escape(_s) 40 | if _s == "": 41 | separator = _s 42 | break 43 | if re.search(_separator, text): 44 | separator = _s 45 | new_separators = separators[i + 1 :] 46 | break 47 | 48 | _separator = re.escape(separator) 49 | splits = split_text_with_regex(text, _separator) 50 | 
51 | # Now go merging things, recursively splitting longer texts. 52 | _good_splits = [] 53 | _separator = "" 54 | for s in splits: 55 | if _good_splits: 56 | merged_text = merge_splits(_good_splits, _separator) 57 | final_chunks.extend(merged_text) 58 | _good_splits = [] 59 | if not new_separators: 60 | final_chunks.append(s) 61 | else: 62 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 63 | final_chunks.extend(other_info) 64 | if _good_splits: 65 | merged_text = merge_splits(_good_splits, _separator) 66 | final_chunks.extend(merged_text) 67 | return final_chunks 68 | 69 | 70 | def _split_text(text: str, separators: list[str], chunk_size: int, chunk_overlap: int) -> list[str]: 71 | """Split incoming text and return chunks.""" 72 | final_chunks = [] 73 | # Get appropriate separator to use 74 | separator = separators[-1] 75 | new_separators = [] 76 | for i, _s in enumerate(separators): 77 | _separator = re.escape(_s) 78 | if _s == "": 79 | separator = _s 80 | break 81 | if re.search(_separator, text): 82 | separator = _s 83 | new_separators = separators[i + 1 :] 84 | break 85 | 86 | _separator = re.escape(separator) 87 | splits = split_text_with_regex(text, _separator) 88 | 89 | # Now go merging things, recursively splitting longer texts. 90 | _good_splits = [] 91 | _separator = separator 92 | for s in splits: 93 | if len(s) < chunk_size: 94 | _good_splits.append(s) 95 | else: 96 | if _good_splits: 97 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 98 | final_chunks.extend(merged_text) 99 | _good_splits = [] 100 | if not new_separators: 101 | final_chunks.append(s) 102 | else: 103 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 104 | final_chunks.extend(other_info) 105 | if _good_splits: 106 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 107 | final_chunks.extend(merged_text) 108 | return final_chunks 109 | 110 | 111 | def split_text_with_regex(text: str, separator: str) -> list[str]: 112 | # Now that we have the separator, split the text 113 | if separator: 114 | # The parentheses in the pattern keep the delimiters in the result. 115 | _splits = re.split(f"({separator})", text) 116 | splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] 117 | if len(_splits) % 2 == 0: 118 | splits += _splits[-1:] 119 | splits = [_splits[0], *splits] 120 | else: 121 | splits = list(text) 122 | return [s for s in splits if s != ""] 123 | 124 | 125 | def merge_splits(splits: Iterable[str], separator: str, chunk_size, chunk_overlap) -> list[str]: 126 | # We now want to combine these smaller pieces into medium size 127 | # chunks to send to the LLM. 
128 | separator_len = len(separator) 129 | 130 | docs = [] 131 | current_doc: list[str] = [] 132 | total = 0 133 | for d in splits: 134 | _len = len(d) 135 | if total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size: 136 | if total > chunk_size: 137 | logging.warning(f"Created a chunk of size {total}, which is longer than the specified {chunk_size}") 138 | if len(current_doc) > 0: 139 | doc = join_docs(current_doc, separator) 140 | if doc is not None: 141 | docs.append(doc) 142 | # Keep on popping if: 143 | # - we have a larger chunk than in the chunk overlap 144 | # - or if we still have any chunks and the length is long 145 | while total > chunk_overlap or ( 146 | total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size and total > 0 147 | ): 148 | total -= len(current_doc[0]) + (separator_len if len(current_doc) > 1 else 0) 149 | current_doc = current_doc[1:] 150 | current_doc.append(d) 151 | total += _len + (separator_len if len(current_doc) > 1 else 0) 152 | doc = join_docs(current_doc, separator) 153 | if doc is not None: 154 | docs.append(doc) 155 | return docs 156 | 157 | 158 | def join_docs(docs: list[str], separator: str) -> str | None: 159 | text = separator.join(docs) 160 | text = text.strip() 161 | 162 | if text == "": 163 | return None 164 | else: 165 | return text 166 | 167 | 168 | def process_documents( 169 | documents: Doc, 170 | parse_config_directory: str, 171 | parse_config_file: str, 172 | ) -> list: 173 | # You can use spacy.explain to get a description for these terms 174 | # Or see the model in https://spacy.io/usage/models and look for model label data 175 | 176 | parse_config_path = join(".", parse_config_directory, parse_config_file) 177 | if exists(parse_config_path): 178 | with open(parse_config_path) as key_file: 179 | filter_content = key_file.read() 180 | filter_configs = json.loads(filter_content) 181 | else: 182 | logging.info("Could not load parse config file") 183 | return 184 | 185 | ngrams_list = filter_configs["ngs"] 186 | entities_list = filter_configs["entities"] 187 | noun_chunks = filter_configs["noun_chunks"] 188 | extract_type = filter_configs["extract_type"] 189 | 190 | logging.debug("Extracting terms from corpus") 191 | extracted_terms = terms( 192 | documents, 193 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 194 | ents=partial( 195 | entities, 196 | include_types=entities_list, 197 | ), 198 | dedupe=True, 199 | ) 200 | 201 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 202 | 203 | logging.debug(f"{len(lemma_strings)} metadata keys created") 204 | return lemma_strings 205 | 206 | 207 | @click.command() 208 | @click.option( 209 | "--documents-directory", 210 | "-d", 211 | "documents_directory", 212 | default="./run_files/documents/skynet", 213 | help="The directory where your text files are stored", 214 | ) 215 | @click.option( 216 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 
217 | ) 218 | @click.option( 219 | "--keyfile-name", 220 | "-k", 221 | "keyfile_name", 222 | default="keyfile", 223 | help="Keyfile name.", 224 | ) 225 | @click.option( 226 | "--model", 227 | "-m", 228 | default="en_core_web_lg", 229 | help="The spacy model to parse the text", 230 | ) 231 | @click.option( 232 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 233 | ) 234 | @click.option( 235 | "--parse-config-file", 236 | "-pcf", 237 | default="ner_types_analyze.json", 238 | help="The parse config file", 239 | ) 240 | @click.option( 241 | "--chunk-size", 242 | "-cs", 243 | "chunk_size", 244 | type=int, 245 | default=1000000, 246 | help="The text chunk size for parsing. Default spacy maximum chunk size", 247 | ) 248 | @click.option( 249 | "--chunk-overlap", 250 | "-co", 251 | "chunk_overlap", 252 | default=0, 253 | type=int, 254 | help="The overlap for text chunks for parsing", 255 | ) 256 | def main( 257 | documents_directory: str, 258 | key_storage: str, 259 | keyfile_name: str, 260 | model: str, 261 | parse_config_directory: str, 262 | parse_config_file: str, 263 | chunk_size: int, 264 | chunk_overlap: int, 265 | ) -> None: 266 | """This script is a rough implementation of Class-based TF-IDF. 267 | See: https://www.maartengrootendorst.com/blog/ctfidf/ 268 | Tries to find words that represent a topic. Needs a large amount of topics to work. 269 | This is just a crude proof of concept. Has poor performance and some hacks. 270 | """ 271 | documents_pattern = os.path.join(documents_directory, "*.txt") 272 | documents_paths_txt = glob.glob(documents_pattern) 273 | 274 | # Contains the total occurances of words in all material 275 | all_words = {} 276 | # Contains a the occurance frequency in the doc and c-TF-IDF value 277 | all_docs = {} 278 | # Total word counts in docs 279 | total_word_count = {} 280 | for txt_document in documents_paths_txt: 281 | logging.info(f"Parsing: {txt_document}") 282 | with open(txt_document, encoding="utf-8") as f: 283 | content = f.read() 284 | parts = split_text(content, chunk_size, chunk_overlap) 285 | data = {} 286 | for part in parts: 287 | doc = core.make_spacy_doc(part, lang=model) 288 | words = process_documents(doc, parse_config_directory, parse_config_file) 289 | 290 | for word in words: 291 | # TODO Better word filter 292 | if word not in [ 293 | "User", 294 | "user", 295 | ]: 296 | if word in data.keys(): 297 | data[word] = data[word] + 1 298 | else: 299 | data[word] = 1 300 | if word in all_words.keys(): 301 | all_words[word] = all_words[word] + 1 302 | else: 303 | all_words[word] = 1 304 | all_docs[txt_document] = data 305 | 306 | frequency = 0 307 | for _key, word in data.items(): 308 | frequency = frequency + word 309 | total_word_count[txt_document] = frequency 310 | 311 | total_documents = len(all_docs) 312 | # TODO Process documents in threads? 
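    # The loop below scores each word per document with a class-based TF-IDF:
    #   tf    = word_count_in_class / words_in_class_total
    #   idf   = max(log(total_documents / word_count_in_total), 1)
    #   score = tf * idf * 100000   (scaled only for readability)
    # The keyfile then maps word -> (raw count in this document, score), sorted with the
    # highest-scoring topic words first.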
313 | for doc_name, adoc in all_docs.items(): 314 | logging.info(f"Words for document - {doc_name}: {len(adoc)}") 315 | logging.info(f"Calculating values for: {doc_name}") 316 | topic_collection = {} 317 | for word_key, word_count in adoc.items(): 318 | word_count_in_class = word_count 319 | words_in_class_total = total_word_count[doc_name] 320 | word_count_in_total = all_words[word_key] 321 | log_part = math.log(total_documents / word_count_in_total) 322 | log_part = max(log_part, 1) 323 | reference_value = (word_count_in_class / words_in_class_total * log_part) * 100000 324 | reference_tuple = (word_count, reference_value) 325 | topic_collection[word_key] = reference_tuple 326 | 327 | topic_collection = dict(sorted(topic_collection.items(), key=lambda item: item[1][1], reverse=True)) 328 | 329 | collection_name = Path(doc_name).stem 330 | complete_name = collection_name + "_" + keyfile_name 331 | key_storage_path = os.path.join(key_storage, complete_name + ".json") 332 | logging.info(f"Saving values for: {doc_name}") 333 | with open(key_storage_path, mode="w", encoding="utf-8") as key_file: 334 | json.dump(topic_collection, key_file) 335 | 336 | 337 | if __name__ == "__main__": 338 | main() 339 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/general_word_frequency.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import os 5 | import re 6 | from collections.abc import Iterable 7 | from functools import partial 8 | from os.path import exists, join 9 | 10 | import click 11 | from document_parsing.extract import entities, ngrams, terms 12 | from document_parsing.extract.basics import terms_to_strings 13 | from document_parsing.spacier import core 14 | from dotenv import find_dotenv, load_dotenv 15 | from spacy.tokens import Doc 16 | 17 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 18 | 19 | load_dotenv(find_dotenv()) 20 | 21 | SPACY_CHARACTER_LIMIT = 1000000 22 | 23 | 24 | def split_text( 25 | text: str, 26 | chunk_size: int, 27 | chunk_overlap: int, 28 | ) -> list[str]: 29 | separators = ["\n\n", "\n", " ", ""] 30 | 31 | """Split incoming text and return chunks.""" 32 | final_chunks = [] 33 | # Get appropriate separator to use 34 | separator = separators[-1] 35 | new_separators = [] 36 | for i, _s in enumerate(separators): 37 | _separator = re.escape(_s) 38 | if _s == "": 39 | separator = _s 40 | break 41 | if re.search(_separator, text): 42 | separator = _s 43 | new_separators = separators[i + 1 :] 44 | break 45 | 46 | _separator = re.escape(separator) 47 | splits = split_text_with_regex(text, _separator) 48 | 49 | # Now go merging things, recursively splitting longer texts. 
50 | _good_splits = [] 51 | _separator = "" 52 | for s in splits: 53 | if _good_splits: 54 | merged_text = merge_splits(_good_splits, _separator) 55 | final_chunks.extend(merged_text) 56 | _good_splits = [] 57 | if not new_separators: 58 | final_chunks.append(s) 59 | else: 60 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 61 | final_chunks.extend(other_info) 62 | if _good_splits: 63 | merged_text = merge_splits(_good_splits, _separator) 64 | final_chunks.extend(merged_text) 65 | return final_chunks 66 | 67 | 68 | def _split_text(text: str, separators: list[str], chunk_size: int, chunk_overlap: int) -> list[str]: 69 | """Split incoming text and return chunks.""" 70 | final_chunks = [] 71 | # Get appropriate separator to use 72 | separator = separators[-1] 73 | new_separators = [] 74 | for i, _s in enumerate(separators): 75 | _separator = re.escape(_s) 76 | if _s == "": 77 | separator = _s 78 | break 79 | if re.search(_separator, text): 80 | separator = _s 81 | new_separators = separators[i + 1 :] 82 | break 83 | 84 | _separator = re.escape(separator) 85 | splits = split_text_with_regex(text, _separator) 86 | 87 | # Now go merging things, recursively splitting longer texts. 88 | _good_splits = [] 89 | _separator = separator 90 | for s in splits: 91 | if len(s) < chunk_size: 92 | _good_splits.append(s) 93 | else: 94 | if _good_splits: 95 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 96 | final_chunks.extend(merged_text) 97 | _good_splits = [] 98 | if not new_separators: 99 | final_chunks.append(s) 100 | else: 101 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 102 | final_chunks.extend(other_info) 103 | if _good_splits: 104 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 105 | final_chunks.extend(merged_text) 106 | return final_chunks 107 | 108 | 109 | def split_text_with_regex(text: str, separator: str) -> list[str]: 110 | # Now that we have the separator, split the text 111 | if separator: 112 | # The parentheses in the pattern keep the delimiters in the result. 113 | _splits = re.split(f"({separator})", text) 114 | splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] 115 | if len(_splits) % 2 == 0: 116 | splits += _splits[-1:] 117 | splits = [_splits[0], *splits] 118 | else: 119 | splits = list(text) 120 | return [s for s in splits if s != ""] 121 | 122 | 123 | def merge_splits(splits: Iterable[str], separator: str, chunk_size, chunk_overlap) -> list[str]: 124 | # We now want to combine these smaller pieces into medium size 125 | # chunks to send to the LLM. 
126 | separator_len = len(separator) 127 | 128 | docs = [] 129 | current_doc: list[str] = [] 130 | total = 0 131 | for d in splits: 132 | _len = len(d) 133 | if total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size: 134 | if total > chunk_size: 135 | logging.warning(f"Created a chunk of size {total}, which is longer than the specified {chunk_size}") 136 | if len(current_doc) > 0: 137 | doc = join_docs(current_doc, separator) 138 | if doc is not None: 139 | docs.append(doc) 140 | # Keep on popping if: 141 | # - we have a larger chunk than in the chunk overlap 142 | # - or if we still have any chunks and the length is long 143 | while total > chunk_overlap or ( 144 | total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size and total > 0 145 | ): 146 | total -= len(current_doc[0]) + (separator_len if len(current_doc) > 1 else 0) 147 | current_doc = current_doc[1:] 148 | current_doc.append(d) 149 | total += _len + (separator_len if len(current_doc) > 1 else 0) 150 | doc = join_docs(current_doc, separator) 151 | if doc is not None: 152 | docs.append(doc) 153 | return docs 154 | 155 | 156 | def join_docs(docs: list[str], separator: str) -> str | None: 157 | text = separator.join(docs) 158 | text = text.strip() 159 | 160 | if text == "": 161 | return None 162 | else: 163 | return text 164 | 165 | 166 | def process_documents( 167 | documents: Doc, 168 | parse_config_directory: str, 169 | parse_config_file: str, 170 | ) -> list: 171 | # You can use spacy.explain to get a description for these terms 172 | # Or see the model in https://spacy.io/usage/models and look for model label data 173 | 174 | parse_config_path = join(".", parse_config_directory, parse_config_file) 175 | if exists(parse_config_path): 176 | with open(parse_config_path) as key_file: 177 | filter_content = key_file.read() 178 | filter_configs = json.loads(filter_content) 179 | else: 180 | logging.info("Could not load parse config file") 181 | return 182 | 183 | ngrams_list = filter_configs["ngs"] 184 | entities_list = filter_configs["entities"] 185 | noun_chunks = filter_configs["noun_chunks"] 186 | extract_type = filter_configs["extract_type"] 187 | 188 | logging.debug("Extracting terms from corpus") 189 | extracted_terms = terms( 190 | documents, 191 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 192 | ents=partial( 193 | entities, 194 | include_types=entities_list, 195 | ), 196 | dedupe=True, 197 | ) 198 | 199 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 200 | 201 | logging.debug(f"{len(lemma_strings)} metadata keys created") 202 | return lemma_strings 203 | 204 | 205 | @click.command() 206 | @click.option( 207 | "--documents-directory", 208 | "-d", 209 | "documents_directory", 210 | default="./run_files/documents/skynet", 211 | help="The directory where your text files are stored", 212 | ) 213 | @click.option( 214 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 
215 | ) 216 | @click.option( 217 | "--keyfile-name", 218 | "-k", 219 | "keyfile_name", 220 | default="keyfile.json", 221 | help="Keyfile name.", 222 | ) 223 | @click.option( 224 | "--model", 225 | "-m", 226 | default="en_core_web_lg", 227 | help="The spacy model to parse the text", 228 | ) 229 | @click.option( 230 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 231 | ) 232 | @click.option( 233 | "--parse-config-file", 234 | "-pcf", 235 | default="ner_types_full.json", 236 | help="The parse config file", 237 | ) 238 | @click.option( 239 | "--chunk-size", 240 | "-cs", 241 | "chunk_size", 242 | type=int, 243 | default=1000000, 244 | help="The text chunk size for parsing. Default spacy maximum chunk size", 245 | ) 246 | @click.option( 247 | "--chunk-overlap", 248 | "-co", 249 | "chunk_overlap", 250 | default=0, 251 | type=int, 252 | help="The overlap for text chunks for parsing", 253 | ) 254 | def main( 255 | documents_directory: str, 256 | key_storage: str, 257 | keyfile_name: str, 258 | model: str, 259 | parse_config_directory: str, 260 | parse_config_file: str, 261 | chunk_size: int, 262 | chunk_overlap: int, 263 | ) -> None: 264 | """Parse ner keywords from text using spacy and grammar configuration files.""" 265 | documents_pattern = os.path.join(documents_directory, "*.txt") 266 | documents_paths_txt = glob.glob(documents_pattern) 267 | 268 | # TODO c-TF-IDF instead of frequency 269 | # Lemma graphs and matplotlib representations 270 | 271 | data = {} 272 | for txt_document in documents_paths_txt: 273 | logging.info(f"Parsing: {txt_document}") 274 | with open(txt_document, encoding="utf-8") as f: 275 | content = f.read() 276 | parts = split_text(content, chunk_size, chunk_overlap) 277 | 278 | for part in parts: 279 | doc = core.make_spacy_doc(part, lang=model) 280 | words = process_documents(doc, parse_config_directory, parse_config_file) 281 | for word in words: 282 | if word in data.keys(): 283 | data[word] = data[word] + 1 284 | else: 285 | data[word] = 1 286 | 287 | # Filter words that occure only once 288 | data = {k: v for k, v in data.items() if v > 1} 289 | 290 | # Sort with most common first 291 | sorted_data = dict(sorted(data.items(), key=lambda item: item[1], reverse=True)) 292 | 293 | logging.info(f"Total words: {len(data)}") 294 | key_storage_path = os.path.join(key_storage, keyfile_name + ".json") 295 | with open(key_storage_path, mode="w", encoding="utf-8") as key_file: 296 | json.dump(sorted_data, key_file) 297 | 298 | 299 | if __name__ == "__main__": 300 | main() 301 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/generate_uuid.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import uuid 3 | 4 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 5 | 6 | logging.debug(str(uuid.uuid1())) 7 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/parse_ner.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | import re 7 | import uuid 8 | from collections.abc import Iterable 9 | from functools import partial 10 | from multiprocessing import Manager, Pool 11 | from os.path import exists, join 12 | from queue import Queue 13 | 14 | import click 15 
| import pandas as pd 16 | from document_parsing.extract import entities, ngrams, terms 17 | from document_parsing.extract.basics import terms_to_strings 18 | from document_parsing.spacier import core 19 | from dotenv import find_dotenv, load_dotenv 20 | from spacy.tokens import Doc 21 | 22 | # This is the config for multiprocess logger 23 | # Setting the level to debug outputs multiprocess debug lines too 24 | NER_LOGGER = mp.get_logger() 25 | FORMAT = "%(levelname)s:%(message)s" 26 | formatter = logging.Formatter(fmt=FORMAT) 27 | handler = logging.StreamHandler() 28 | handler.setFormatter(formatter) 29 | 30 | NER_LOGGER.addHandler(handler) 31 | NER_LOGGER.setLevel(logging.INFO) 32 | 33 | load_dotenv(find_dotenv()) 34 | 35 | SPACY_CHARACTER_LIMIT = 1000000 36 | 37 | 38 | def split_text( 39 | text: str, 40 | chunk_size: int, 41 | chunk_overlap: int, 42 | ) -> list[str]: 43 | separators = ["\n\n", "\n", " ", ""] 44 | 45 | """Split incoming text and return chunks.""" 46 | final_chunks = [] 47 | # Get appropriate separator to use 48 | separator = separators[-1] 49 | new_separators = [] 50 | for i, _s in enumerate(separators): 51 | _separator = re.escape(_s) 52 | if _s == "": 53 | separator = _s 54 | break 55 | if re.search(_separator, text): 56 | separator = _s 57 | new_separators = separators[i + 1 :] 58 | break 59 | 60 | _separator = re.escape(separator) 61 | splits = split_text_with_regex(text, _separator) 62 | 63 | # Now go merging things, recursively splitting longer texts. 64 | _good_splits = [] 65 | _separator = "" 66 | for s in splits: 67 | if _good_splits: 68 | merged_text = merge_splits(_good_splits, _separator) 69 | final_chunks.extend(merged_text) 70 | _good_splits = [] 71 | if not new_separators: 72 | final_chunks.append(s) 73 | else: 74 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 75 | final_chunks.extend(other_info) 76 | if _good_splits: 77 | merged_text = merge_splits(_good_splits, _separator) 78 | final_chunks.extend(merged_text) 79 | return final_chunks 80 | 81 | 82 | def _split_text(text: str, separators: list[str], chunk_size: int, chunk_overlap: int) -> list[str]: 83 | """Split incoming text and return chunks.""" 84 | final_chunks = [] 85 | # Get appropriate separator to use 86 | separator = separators[-1] 87 | new_separators = [] 88 | for i, _s in enumerate(separators): 89 | _separator = re.escape(_s) 90 | if _s == "": 91 | separator = _s 92 | break 93 | if re.search(_separator, text): 94 | separator = _s 95 | new_separators = separators[i + 1 :] 96 | break 97 | 98 | _separator = re.escape(separator) 99 | splits = split_text_with_regex(text, _separator) 100 | 101 | # Now go merging things, recursively splitting longer texts. 
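    # In this recursive variant, pieces shorter than chunk_size are buffered in
    # _good_splits and merged via merge_splits(); longer pieces flush the buffer and are
    # either re-split with the remaining, finer separators or appended as-is when none
    # remain.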
102 | _good_splits = [] 103 | _separator = separator 104 | for s in splits: 105 | if len(s) < chunk_size: 106 | _good_splits.append(s) 107 | else: 108 | if _good_splits: 109 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 110 | final_chunks.extend(merged_text) 111 | _good_splits = [] 112 | if not new_separators: 113 | final_chunks.append(s) 114 | else: 115 | other_info = _split_text(s, new_separators, chunk_size, chunk_overlap) 116 | final_chunks.extend(other_info) 117 | if _good_splits: 118 | merged_text = merge_splits(_good_splits, _separator, chunk_size, chunk_overlap) 119 | final_chunks.extend(merged_text) 120 | return final_chunks 121 | 122 | 123 | def split_text_with_regex(text: str, separator: str) -> list[str]: 124 | # Now that we have the separator, split the text 125 | if separator: 126 | # The parentheses in the pattern keep the delimiters in the result. 127 | _splits = re.split(f"({separator})", text) 128 | splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] 129 | if len(_splits) % 2 == 0: 130 | splits += _splits[-1:] 131 | splits = [_splits[0], *splits] 132 | else: 133 | splits = list(text) 134 | return [s for s in splits if s != ""] 135 | 136 | 137 | def merge_splits(splits: Iterable[str], separator: str, chunk_size, chunk_overlap) -> list[str]: 138 | # We now want to combine these smaller pieces into medium size 139 | # chunks to send to the LLM. 140 | separator_len = len(separator) 141 | 142 | docs = [] 143 | current_doc: list[str] = [] 144 | total = 0 145 | for d in splits: 146 | _len = len(d) 147 | if total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size: 148 | if total > chunk_size: 149 | NER_LOGGER.warning(f"Created a chunk of size {total}, which is longer than the specified {chunk_size}") 150 | if len(current_doc) > 0: 151 | doc = join_docs(current_doc, separator) 152 | if doc is not None: 153 | docs.append(doc) 154 | # Keep on popping if: 155 | # - we have a larger chunk than in the chunk overlap 156 | # - or if we still have any chunks and the length is long 157 | while total > chunk_overlap or ( 158 | total + _len + (separator_len if len(current_doc) > 0 else 0) > chunk_size and total > 0 159 | ): 160 | total -= len(current_doc[0]) + (separator_len if len(current_doc) > 1 else 0) 161 | current_doc = current_doc[1:] 162 | current_doc.append(d) 163 | total += _len + (separator_len if len(current_doc) > 1 else 0) 164 | doc = join_docs(current_doc, separator) 165 | if doc is not None: 166 | docs.append(doc) 167 | return docs 168 | 169 | 170 | def join_docs(docs: list[str], separator: str) -> str | None: 171 | text = separator.join(docs) 172 | text = text.strip() 173 | 174 | if text == "": 175 | return None 176 | else: 177 | return text 178 | 179 | 180 | def process_documents( 181 | documents: Doc, 182 | parse_config_directory: str, 183 | parse_config_file: str, 184 | ) -> pd.Series: 185 | # You can use spacy.explain to get a description for these terms 186 | # Or see the model in https://spacy.io/usage/models and look for model label data 187 | 188 | parse_config_path = join(".", parse_config_directory, parse_config_file) 189 | if exists(parse_config_path): 190 | with open(parse_config_path) as key_file: 191 | filter_content = key_file.read() 192 | filter_configs = json.loads(filter_content) 193 | else: 194 | NER_LOGGER.info("Could not load parse config file") 195 | return 196 | 197 | ngrams_list = filter_configs["ngs"] 198 | entities_list = filter_configs["entities"] 199 | noun_chunks = 
filter_configs["noun_chunks"] 200 | extract_type = filter_configs["extract_type"] 201 | 202 | NER_LOGGER.info("Extracting terms from corpus") 203 | extracted_terms = terms( 204 | documents, 205 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 206 | ents=partial( 207 | entities, 208 | include_types=entities_list, 209 | ), 210 | dedupe=True, 211 | ) 212 | 213 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 214 | all_keys = {} 215 | 216 | NER_LOGGER.info(f"{len(lemma_strings)} metadata keys created") 217 | 218 | # Create uuids for metadata filters 219 | for line in lemma_strings: 220 | filter_uuid = str(uuid.uuid1()) 221 | all_keys[filter_uuid] = line 222 | return pd.Series(all_keys) 223 | 224 | 225 | def read_chuncks(text_corpus, chunk_size, chunk_overlap, que, reader_num) -> bool: 226 | NER_LOGGER.info("Reading chuncks to que") 227 | parts = split_text(text_corpus, chunk_size, chunk_overlap) 228 | for doc in parts: 229 | que.put(doc) 230 | for _i in range(reader_num): 231 | que.put("QUEUE_DONE") 232 | NER_LOGGER.info("Reader done") 233 | return True 234 | 235 | 236 | def process_chuncks(model, parse_config_directory, parse_config_file, read_que: Queue, write_que: Queue, name) -> bool: 237 | NER_LOGGER.info(f"Processor {name} reading chuncks from que") 238 | while True: 239 | try: 240 | corpus = read_que.get(timeout=10) 241 | except Exception as e: 242 | NER_LOGGER.info(f"Processor {name} timed out: {e}") 243 | write_que.put("QUEUE_DONE") 244 | return False 245 | 246 | if corpus == "QUEUE_DONE": 247 | NER_LOGGER.info(f"Processor {name} done") 248 | write_que.put("QUEUE_DONE") 249 | break 250 | doc = core.make_spacy_doc(corpus, lang=model) 251 | pseries = process_documents(doc, parse_config_directory, parse_config_file) 252 | write_que.put(pseries) 253 | return True 254 | 255 | 256 | def clean_and_merge_chunks(que: Queue, name) -> pd.DataFrame: 257 | NER_LOGGER.info(f"cleaner {name} reading chuncks from que") 258 | df = None 259 | while True: 260 | try: 261 | corpus = que.get(timeout=10) 262 | except Exception as e: 263 | NER_LOGGER.info(f"Writer {name} timed out: {e}") 264 | df["Content"].apply(lambda x: x.strip()) 265 | # TODO Place this filter in config file 266 | # Removes one and two letter words 267 | m = ~df.apply(lambda x: x.str.contains("\\b[a-zA-Z]{1,2}\\b")).any(axis=1) 268 | df = df[m] 269 | return df 270 | if not isinstance(corpus, pd.Series) and corpus == "QUEUE_DONE": 271 | NER_LOGGER.info(f"Writer {name} received done") 272 | break 273 | elif isinstance(corpus, pd.Series): 274 | NER_LOGGER.info(f"Writer {name} received a chunck") 275 | if df is None: 276 | df = pd.DataFrame(corpus, columns=["Content"]) 277 | else: 278 | df2 = pd.DataFrame(corpus, columns=["Content"]) 279 | df = pd.concat([df, df2]) 280 | 281 | df["Content"].apply(lambda x: x.strip()) 282 | # TODO Place this filter in config file 283 | # Removes one and two letter words 284 | m = ~df.apply(lambda x: x.str.contains("\\b[a-zA-Z]{1,2}\\b")).any(axis=1) 285 | df = df[m] 286 | NER_LOGGER.info(f"writer {name} - Total amount of keys created: {len(df.index)}") 287 | 288 | return df 289 | 290 | 291 | @click.command() 292 | @click.option( 293 | "--documents-directory", 294 | "-d", 295 | "documents_directory", 296 | default="./run_files/documents/skynet", 297 | help="The directory where your text files are stored", 298 | ) 299 | @click.option( 300 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 
301 | ) 302 | @click.option( 303 | "--keyfile-name", 304 | "-k", 305 | "keyfile_name", 306 | default="keyfile.json", 307 | help="Keyfile name.", 308 | ) 309 | @click.option( 310 | "--model", 311 | "-m", 312 | default="en_core_web_lg", 313 | help="The spacy model to parse the text", 314 | ) 315 | @click.option( 316 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 317 | ) 318 | @click.option( 319 | "--parse-config-file", 320 | "-pcf", 321 | default="ner_types.json", 322 | help="The parse config file", 323 | ) 324 | @click.option( 325 | "--chunk-size", 326 | "-cs", 327 | "chunk_size", 328 | type=int, 329 | default=1000000, 330 | help="The text chunk size for parsing. Default spacy maximum chunk size", 331 | ) 332 | @click.option( 333 | "--chunk-overlap", 334 | "-co", 335 | "chunk_overlap", 336 | default=0, 337 | type=int, 338 | help="The overlap for text chunks for parsing", 339 | ) 340 | @click.option( 341 | "--threads", 342 | "-t", 343 | default=6, 344 | type=int, 345 | help="The parse config file", 346 | ) 347 | def main( 348 | documents_directory: str, 349 | key_storage: str, 350 | keyfile_name: str, 351 | model: str, 352 | parse_config_directory: str, 353 | parse_config_file: str, 354 | chunk_size: int, 355 | chunk_overlap: int, 356 | threads: int, 357 | ) -> None: 358 | """Parse ner keywords from text using spacy and grammar configuration files.""" 359 | documents_pattern = os.path.join(documents_directory, "*.txt") 360 | documents_paths_txt = glob.glob(documents_pattern) 361 | text_corpus = "" 362 | 363 | for txt_document in documents_paths_txt: 364 | NER_LOGGER.info(f"Reading: {txt_document}") 365 | with open(txt_document, encoding="utf-8") as f: 366 | content = f.read() 367 | text_corpus = text_corpus + content 368 | 369 | manager = Manager() 370 | read_que = manager.Queue() 371 | write_que = manager.Queue() 372 | 373 | pool = Pool(threads) 374 | 375 | reader = pool.apply_async( 376 | read_chuncks, 377 | ( 378 | text_corpus, 379 | chunk_size, 380 | chunk_overlap, 381 | read_que, 382 | threads, 383 | ), 384 | ) 385 | 386 | read_success = reader.get() 387 | if not read_success: 388 | return 389 | 390 | jobs = [] 391 | for i in range(threads): 392 | job = pool.apply_async( 393 | process_chuncks, 394 | ( 395 | model, 396 | parse_config_directory, 397 | parse_config_file, 398 | read_que, 399 | write_que, 400 | i, 401 | ), 402 | ) 403 | jobs.append(job) 404 | 405 | for job in jobs: 406 | job.get() 407 | 408 | jobs = [] 409 | for i in range(threads): 410 | job = pool.apply_async( 411 | clean_and_merge_chunks, 412 | ( 413 | write_que, 414 | i, 415 | ), 416 | ) 417 | jobs.append(job) 418 | 419 | df = None 420 | for job in jobs: 421 | merge_result = job.get() 422 | if merge_result is not None: 423 | if df is None: 424 | df = merge_result 425 | else: 426 | df = pd.concat([df, merge_result]) 427 | 428 | pool.close() 429 | pool.join() 430 | 431 | if df is not None: 432 | df = df.drop_duplicates() 433 | NER_LOGGER.info(f"Total amount of keys created: {len(df.index)}") 434 | key_storage_path = os.path.join(key_storage, keyfile_name + ".json") 435 | 436 | NER_LOGGER.debug("Create key file") 437 | json_key_file = df.to_json() 438 | with open(key_storage_path, mode="w", encoding="utf-8") as key_file: 439 | key_file.write(json_key_file) 440 | 441 | NER_LOGGER.info(f"Read files from directory: {documents_directory}") 442 | NER_LOGGER.info(f"Wrote keys to: {key_storage_path}") 443 | 444 | 445 | if __name__ == "__main__": 446 | main() 447 | 
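# A minimal sketch of reading the keyfile written above. The helper name is an
# illustrative assumption; the layout follows pandas DataFrame.to_json(), i.e. a single
# "Content" column mapping generated uuid strings to extracted terms.
def _print_keyfile_terms(key_storage_path: str) -> None:
    import json

    with open(key_storage_path, encoding="utf-8") as key_file:
        keys = json.load(key_file)
    # keys["Content"] is a dict of {uuid: term}
    for filter_uuid, term in keys["Content"].items():
        NER_LOGGER.info(f"{filter_uuid}: {term}")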
-------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_analysis/spacy_explain.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | print(spacy.explain("PROPN")) 4 | print(spacy.explain("NOUN")) 5 | print(spacy.explain("ADJ")) 6 | print(spacy.explain("NNP")) 7 | print(spacy.explain("NN")) 8 | print(spacy.explain("AUX")) 9 | print(spacy.explain("VBZ")) 10 | print(spacy.explain("VERB")) 11 | print(spacy.explain("ADP")) 12 | print(spacy.explain("SYM")) 13 | print(spacy.explain("NUM")) 14 | print(spacy.explain("CD")) 15 | print(spacy.explain("VBG")) 16 | print(spacy.explain("ROOT")) 17 | 18 | print(spacy.explain("FAC")) 19 | print(spacy.explain("NORP")) 20 | print(spacy.explain("GPE")) 21 | print(spacy.explain("PRODUCT")) 22 | print(spacy.explain("EVENT")) 23 | print(spacy.explain("PERSON")) 24 | print(spacy.explain("ORG")) 25 | print(spacy.explain("LOC")) 26 | print(spacy.explain("DATE")) 27 | print(spacy.explain("TIME")) 28 | print(spacy.explain("WORK_OF_ART")) 29 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/document_parsing/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from document_parsing.extract.basics import entities, ngrams, noun_chunks, terms 2 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/extract/basics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basics 3 | ------ 4 | 5 | :mod:`textacy.extract.basics`: Extract basic components from a document or sentence 6 | via spaCy, with bells and whistles for filtering the results. 7 | """ 8 | 9 | import operator 10 | from collections.abc import Callable, Collection, Iterable 11 | from functools import partial 12 | 13 | from cytoolz import itertoolz 14 | from spacy.parts_of_speech import DET 15 | from spacy.tokens import Span 16 | 17 | from document_parsing.utils import constants, errors, types, utils 18 | 19 | 20 | def ngrams( 21 | doclike: types.DocLike, 22 | n: int | Collection[int], 23 | *, 24 | filter_stops: bool = True, 25 | filter_punct: bool = True, 26 | filter_nums: bool = False, 27 | include_pos: str | Collection[str] | None = None, 28 | exclude_pos: str | Collection[str] | None = None, 29 | min_freq: int = 1, 30 | ) -> Iterable[Span]: 31 | """ 32 | Extract an ordered sequence of n-grams (``n`` consecutive tokens) from a spaCy 33 | ``Doc`` or ``Span``, for one or multiple ``n`` values, optionally filtering n-grams 34 | by the types and parts-of-speech of the constituent tokens. 35 | 36 | Args: 37 | doclike 38 | n: Number of tokens included per n-gram; for example, ``2`` yields bigrams 39 | and ``3`` yields trigrams. If multiple values are specified, then the 40 | collections of n-grams are concatenated together; for example, ``(2, 3)`` 41 | yields bigrams and then trigrams. 42 | filter_stops: If True, remove ngrams that start or end with a stop word. 
43 | filter_punct: If True, remove ngrams that contain any punctuation-only tokens. 44 | filter_nums: If True, remove ngrams that contain any numbers 45 | or number-like tokens (e.g. 10, 'ten'). 46 | include_pos: Remove ngrams if any constituent tokens' part-of-speech tags 47 | ARE NOT included in this param. 48 | exclude_pos: Remove ngrams if any constituent tokens' part-of-speech tags 49 | ARE included in this param. 50 | min_freq: Remove ngrams that occur in ``doclike`` fewer than ``min_freq`` times 51 | 52 | Yields: 53 | Next ngram from ``doclike`` passing all specified filters, in order of appearance 54 | in the document. 55 | 56 | Raises: 57 | ValueError: if any ``n`` < 1 58 | TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str, 59 | or a falsy value 60 | 61 | Note: 62 | Filtering by part-of-speech tag uses the universal POS tag set; for details, 63 | check spaCy's docs: https://spacy.io/api/annotation#pos-tagging 64 | """ 65 | ns_: tuple[int, ...] = utils.to_tuple(n) 66 | if any(n_ < 1 for n_ in ns_): 67 | msg = "n must be greater than or equal to 1" 68 | raise ValueError(msg) 69 | 70 | ngrams_: Iterable[Span] 71 | for n_ in ns_: 72 | ngrams_ = (doclike[i : i + n_] for i in range(len(doclike) - n_ + 1)) 73 | ngrams_ = (ng for ng in ngrams_ if not any(w.is_space for w in ng)) 74 | if filter_stops is True: 75 | ngrams_ = (ng for ng in ngrams_ if not ng[0].is_stop and not ng[-1].is_stop) 76 | if filter_punct is True: 77 | ngrams_ = (ng for ng in ngrams_ if not any(w.is_punct for w in ng)) 78 | if filter_nums is True: 79 | ngrams_ = (ng for ng in ngrams_ if not any(w.like_num for w in ng)) 80 | if include_pos: 81 | include_pos_: set[str] = {pos.upper() for pos in utils.to_set(include_pos)} 82 | ngrams_ = (ng for ng in ngrams_ if all(w.pos_ in include_pos_ for w in ng)) 83 | if exclude_pos: 84 | exclude_pos_: set[str] = {pos.upper() for pos in utils.to_set(exclude_pos)} 85 | ngrams_ = (ng for ng in ngrams_ if not any(w.pos_ in exclude_pos_ for w in ng)) 86 | if min_freq > 1: 87 | ngrams_ = list(ngrams_) 88 | freqs = itertoolz.frequencies(ng.text.lower() for ng in ngrams_) 89 | ngrams_ = (ng for ng in ngrams_ if freqs[ng.text.lower()] >= min_freq) 90 | 91 | yield from ngrams_ 92 | 93 | 94 | def entities( 95 | doclike: types.DocLike, 96 | *, 97 | include_types: str | Collection[str] | None = None, 98 | exclude_types: str | Collection[str] | None = None, 99 | drop_determiners: bool = True, 100 | min_freq: int = 1, 101 | ) -> Iterable[Span]: 102 | """ 103 | Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from 104 | a ``Doc``, optionally filtering by entity types and frequencies. 105 | 106 | Args: 107 | doclike 108 | include_types: Remove entities whose type IS NOT 109 | in this param; if "NUMERIC", all numeric entity types ("DATE", 110 | "MONEY", "ORDINAL", etc.) are included 111 | exclude_types: Remove entities whose type IS 112 | in this param; if "NUMERIC", all numeric entity types ("DATE", 113 | "MONEY", "ORDINAL", etc.) are excluded 114 | drop_determiners: Remove leading determiners (e.g. "the") 115 | from entities (e.g. "the United States" => "United States"). 116 | 117 | .. note:: Entities from which a leading determiner has been removed 118 | are, effectively, *new* entities, and not saved to the ``Doc`` 119 | from which they came. This is irritating but unavoidable, since 120 | this function is not meant to have side-effects on document state. 
121 | If you're only using the text of the returned spans, this is no 122 | big deal, but watch out if you're counting on determiner-less 123 | entities associated with the doc downstream. 124 | 125 | min_freq: Remove entities that occur in ``doclike`` fewer 126 | than ``min_freq`` times 127 | 128 | Yields: 129 | Next entity from ``doclike`` passing all specified filters in order of appearance 130 | in the document 131 | 132 | Raises: 133 | TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set of 134 | str, or a falsy value 135 | """ 136 | ents = doclike.ents 137 | 138 | include_types = _parse_ent_types(include_types, "include") 139 | exclude_types = _parse_ent_types(exclude_types, "exclude") 140 | if include_types: 141 | if isinstance(include_types, str): 142 | ents = (ent for ent in ents if ent.label_ == include_types) 143 | elif isinstance(include_types, set | frozenset | list | tuple): 144 | ents = (ent for ent in ents if ent.label_ in include_types) 145 | if exclude_types: 146 | if isinstance(exclude_types, str): 147 | ents = (ent for ent in ents if ent.label_ != exclude_types) 148 | elif isinstance(exclude_types, set | frozenset | list | tuple): 149 | ents = (ent for ent in ents if ent.label_ not in exclude_types) 150 | if drop_determiners is True: 151 | ents = ( 152 | ent if ent[0].pos != DET else Span(ent.doc, ent.start + 1, ent.end, label=ent.label, vector=ent.vector) 153 | for ent in ents 154 | ) 155 | if min_freq > 1: 156 | ents = list(ents) # type: ignore 157 | freqs = itertoolz.frequencies(ent.text.lower() for ent in ents) 158 | ents = (ent for ent in ents if freqs[ent.text.lower()] >= min_freq) 159 | 160 | yield from ents 161 | 162 | 163 | def _parse_ent_types(ent_types: str | Collection[str] | None, which: str) -> str | set[str] | None: 164 | if not ent_types: 165 | return None 166 | elif isinstance(ent_types, str): 167 | ent_types = ent_types.upper() 168 | # replace the shorthand numeric case by its corresponding constant 169 | if ent_types == "NUMERIC": 170 | return constants.NUMERIC_ENT_TYPES 171 | else: 172 | return ent_types 173 | elif isinstance(ent_types, set | frozenset | list | tuple): 174 | ent_types = {ent_type.upper() for ent_type in ent_types} 175 | # again, replace the shorthand numeric case by its corresponding constant 176 | # and include it in the set in case other types are specified 177 | if any(ent_type == "NUMERIC" for ent_type in ent_types): 178 | return ent_types.union(constants.NUMERIC_ENT_TYPES) 179 | else: 180 | return ent_types 181 | else: 182 | raise TypeError(errors.type_invalid_msg(f"{which}_types", type(ent_types), [str | Collection[str]] | None)) 183 | 184 | 185 | def noun_chunks(doclike: types.DocLike, *, drop_determiners: bool = True, min_freq: int = 1) -> Iterable[Span]: 186 | """ 187 | Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally 188 | filtering by frequency and dropping leading determiners. 189 | 190 | Args: 191 | doclike 192 | drop_determiners: Remove leading determiners (e.g. "the") 193 | from phrases (e.g. 
"the quick brown fox" => "quick brown fox") 194 | min_freq: Remove chunks that occur in ``doclike`` fewer than ``min_freq`` times 195 | 196 | Yields: 197 | Next noun chunk from ``doclike`` in order of appearance in the document 198 | """ 199 | ncs: Iterable[Span] 200 | ncs = doclike.noun_chunks 201 | if drop_determiners is True: 202 | ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs) 203 | if min_freq > 1: 204 | ncs = list(ncs) 205 | freqs = itertoolz.frequencies(nc.text.lower() for nc in ncs) 206 | ncs = (nc for nc in ncs if freqs[nc.text.lower()] >= min_freq) 207 | 208 | yield from ncs 209 | 210 | 211 | def terms( 212 | doclike: types.DocLike, 213 | *, 214 | ngs: int | Collection[int] | types.DocLikeToSpans | None = None, 215 | ents: bool | types.DocLikeToSpans | None = None, 216 | ncs: bool | types.DocLikeToSpans | None = None, 217 | dedupe: bool = True, 218 | ) -> Iterable[Span]: 219 | """ 220 | Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks -- 221 | from ``doclike`` as a single, concatenated collection, with optional deduplication 222 | of spans extracted by more than one type. 223 | 224 | .. code-block:: pycon 225 | 226 | >>> extract.terms(doc, ngs=2, ents=True, ncs=True) 227 | >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2)) 228 | >>> extract.terms(doc, ents=extract.entities) 229 | >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON")) 230 | 231 | Args: 232 | doclike 233 | ngs: N-gram terms to be extracted. 234 | If one or multiple ints, :func:`textacy.extract.ngrams(doclike, n=ngs)` is 235 | used to extract terms; if a callable, ``ngs(doclike)`` is used to extract 236 | terms; if None, no n-gram terms are extracted. 237 | ents: Entity terms to be extracted. 238 | If True, :func:`textacy.extract.entities(doclike)` is used to extract terms; 239 | if a callable, ``ents(doclike)`` is used to extract terms; 240 | if None, no entity terms are extracted. 241 | ncs: Noun chunk terms to be extracted. 242 | If True, :func:`textacy.extract.noun_chunks(doclike)` is used to extract 243 | terms; if a callable, ``ncs(doclike)`` is used to extract terms; 244 | if None, no noun chunk terms are extracted. 245 | dedupe: If True, deduplicate terms whose spans are extracted by multiple types 246 | (e.g. a span that is both an n-gram and an entity), as identified by 247 | identical (start, stop) indexes in ``doclike``; otherwise, don't. 248 | 249 | Returns: 250 | Next term from ``doclike``, in order of n-grams then entities then noun chunks, 251 | with each collection's terms given in order of appearance. 252 | 253 | Note: 254 | This function is *not* to be confused with keyterm extraction, which leverages 255 | statistics and algorithms to quantify the "key"-ness of terms before returning 256 | the top-ranking terms. There is no such scoring or ranking here. 
257 | 258 | See Also: 259 | - :func:`textacy.extact.ngrams()` 260 | - :func:`textacy.extact.entities()` 261 | - :func:`textacy.extact.noun_chunks()` 262 | - :mod:`textacy.extact.keyterms` 263 | """ 264 | extractors = _get_extractors(ngs, ents, ncs) 265 | terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors) 266 | if dedupe is True: 267 | terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end)) 268 | yield from terms_ 269 | 270 | 271 | def _get_extractors(ngs, ents, ncs) -> list[types.DocLikeToSpans]: 272 | all_extractors = [ 273 | _get_ngs_extractor(ngs), 274 | _get_ents_extractor(ents), 275 | _get_ncs_extractor(ncs), 276 | ] 277 | extractors = [extractor for extractor in all_extractors if extractor is not None] 278 | if not extractors: 279 | msg = "at least one term extractor must be specified" 280 | raise ValueError(msg) 281 | else: 282 | return extractors 283 | 284 | 285 | def _get_ngs_extractor(ngs) -> types.DocLikeToSpans | None: 286 | if ngs is None: 287 | return None 288 | elif callable(ngs): 289 | return ngs 290 | elif isinstance(ngs, int) or (isinstance(ngs, Collection) and all(isinstance(ng, int) for ng in ngs)): 291 | return partial(ngrams, n=ngs) 292 | else: 293 | raise TypeError() 294 | 295 | 296 | def _get_ents_extractor(ents) -> types.DocLikeToSpans | None: 297 | if ents is None: 298 | return None 299 | elif callable(ents): 300 | return ents 301 | elif isinstance(ents, bool): 302 | return entities 303 | else: 304 | raise TypeError() 305 | 306 | 307 | def _get_ncs_extractor(ncs) -> types.DocLikeToSpans | None: 308 | if ncs is None: 309 | return None 310 | elif callable(ncs): 311 | return ncs 312 | elif isinstance(ncs, bool): 313 | return noun_chunks 314 | else: 315 | raise TypeError() 316 | 317 | 318 | def terms_to_strings( 319 | terms: Iterable[types.SpanLike], 320 | by: str | Callable[[types.SpanLike], str], 321 | ) -> Iterable[str]: 322 | """ 323 | Transform a sequence of terms as spaCy ``Token`` s or ``Span`` s into strings. 324 | 325 | Args: 326 | terms 327 | by: Method by which terms are transformed into strings. 328 | If "orth", terms are represented by their text exactly as written; 329 | if "lower", by the lowercased form of their text; 330 | if "lemma", by their base form w/o inflectional suffixes; 331 | if a callable, must accept a ``Token`` or ``Span`` and return a string. 332 | 333 | Yields: 334 | Next term in ``terms``, as a string. 
335 | """ 336 | terms_: Iterable[str] 337 | if by == "lower": 338 | terms_ = (term.text.lower() for term in terms) 339 | elif by in ("lemma", "orth"): 340 | by_ = operator.attrgetter(f"{by}_") 341 | terms_ = (by_(term) for term in terms) 342 | elif callable(by): 343 | terms_ = (by(term) for term in terms) 344 | else: 345 | raise ValueError(errors.value_invalid_msg("by", by, {"orth", "lower", "lemma", Callable})) 346 | yield from terms_ 347 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/filter_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | from os.path import exists, join 7 | 8 | import pandas as pd 9 | from dotenv import find_dotenv, load_dotenv 10 | 11 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 12 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 13 | load_dotenv(find_dotenv()) 14 | 15 | 16 | def main( 17 | documents_directory: str, 18 | parse_config_directory: str, 19 | parse_config_file: str, 20 | filter_config_directory: str, 21 | filter_config_file: str, 22 | ) -> None: 23 | documents_pattern = os.path.join(documents_directory, "*.csv") 24 | logging.debug(f"documents search pattern: {documents_pattern}") 25 | documents_paths_csv = glob.glob(documents_pattern) 26 | 27 | logging.debug("Reading config file") 28 | parse_config_path = join(".", parse_config_directory, parse_config_file) 29 | if exists(parse_config_path): 30 | with open(parse_config_path) as key_file: 31 | column_content = key_file.read() 32 | column_configs = json.loads(column_content) 33 | else: 34 | logging.debug("Could not load parse config file") 35 | return 36 | 37 | logging.debug("Reading filter file") 38 | filter_config_path = join(".", filter_config_directory, filter_config_file) 39 | if exists(filter_config_path): 40 | with open(filter_config_path) as key_file: 41 | filter_content = key_file.read() 42 | filter_configs = json.loads(filter_content) 43 | else: 44 | logging.debug("Could not load parse config file") 45 | return 46 | 47 | for csv_document in documents_paths_csv: 48 | logging.debug(f"Processing: {csv_document}") 49 | columns_list = [] 50 | with open(csv_document, encoding="utf8") as f: 51 | first_line = f.readline() 52 | columns_line = "index" + first_line.strip() 53 | logging.debug("Matching csv type to config") 54 | for column_conf_key in column_configs: 55 | columns = column_configs[column_conf_key]["columns"] 56 | columns_string = ",".join(columns) 57 | if columns_string == columns_line: 58 | columns_list = columns 59 | logging.debug("Match found") 60 | break 61 | 62 | logging.debug("Reading to datafile") 63 | df = pd.read_csv(csv_document, header=0, names=columns_list) 64 | item_count = df.shape[0] 65 | logging.debug(f"item count: {item_count}") 66 | logging.debug(df.head()) 67 | 68 | for csv_filter in filter_configs["filters"]: 69 | if "whitelist" in csv_filter: 70 | whitelist = csv_filter["whitelist"] 71 | tags = csv_filter["filter_field"] 72 | df = df[df[tags].apply(lambda x, wordlist=set(whitelist): any(word in x for word in wordlist))] 73 | item_count = df.shape[0] 74 | logging.debug(f"item count: {item_count}") 75 | logging.debug(df.head()) 76 | 77 | if "blacklist" in csv_filter: 78 | blacklist = csv_filter["blacklist"] 79 | tags = csv_filter["filter_field"] 80 | df = df[df[tags].apply(lambda x, 
wordlist=set(blacklist): not any(word in x for word in wordlist))] 81 | item_count = df.shape[0] 82 | logging.debug(f"item count: {item_count}") 83 | logging.debug(df.head()) 84 | 85 | output = documents_directory + "/filtered.csv" 86 | df.to_csv(output, index=False) 87 | 88 | 89 | if __name__ == "__main__": 90 | # Read the data directory, collection name, and persist directory 91 | parser = argparse.ArgumentParser(description="Filter rows from a csv file using whitelist and blacklist filters") 92 | 93 | # Add arguments 94 | parser.add_argument( 95 | "--data-directory", 96 | type=str, 97 | default="./run_files/documents/csv_test", 98 | help="The directory where your csv files are stored", 99 | ) 100 | 101 | parser.add_argument( 102 | "--parse-config-directory", 103 | type=str, 104 | default="./run_files/parse_configs/", 105 | help="The parse config directory", 106 | ) 107 | 108 | parser.add_argument( 109 | "--parse-config-file", 110 | type=str, 111 | default="csv_columns.json", 112 | help="The parse config file", 113 | ) 114 | 115 | parser.add_argument( 116 | "--filter-config-directory", 117 | type=str, 118 | default="./run_files/filters/", 119 | help="The parse config directory", 120 | ) 121 | 122 | parser.add_argument( 123 | "--filter-config-file", 124 | type=str, 125 | default="csv_filter.json", 126 | help="The parse config file", 127 | ) 128 | 129 | # Parse arguments 130 | args = parser.parse_args() 131 | 132 | main( 133 | documents_directory=args.data_directory, 134 | parse_config_directory=args.parse_config_directory, 135 | parse_config_file=args.parse_config_file, 136 | filter_config_directory=args.filter_config_directory, 137 | filter_config_file=args.filter_config_file, 138 | ) 139 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_csv_to_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | from os.path import exists, join, splitext 7 | 8 | import pandas as pd 9 | from dotenv import find_dotenv, load_dotenv 10 | from trafilatura import extract 11 | 12 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 13 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 14 | load_dotenv(find_dotenv()) 15 | 16 | 17 | def main( 18 | documents_directory: str, 19 | parse_config_directory: str, 20 | parse_config_file: str, 21 | filter_config_directory: str, 22 | filter_config_file: str, 23 | ) -> None: 24 | documents_pattern = os.path.join(documents_directory, "*.csv") 25 | logging.debug(f"documents search pattern: {documents_pattern}") 26 | documents_paths_csv = glob.glob(documents_pattern) 27 | 28 | logging.debug("Reading config file") 29 | parse_config_path = join(".", parse_config_directory, parse_config_file) 30 | if exists(parse_config_path): 31 | with open(parse_config_path) as key_file: 32 | column_content = key_file.read() 33 | column_configs = json.loads(column_content) 34 | else: 35 | logging.debug("Could not load parse config file") 36 | return 37 | 38 | logging.debug("Reading filter file") 39 | filter_config_path = join(".", filter_config_directory, filter_config_file) 40 | if exists(filter_config_path): 41 | with open(filter_config_path) as key_file: 42 | filter_content = key_file.read() 43 | filter_configs = json.loads(filter_content) 44 | else: 45 | logging.debug("Could not load parse config file") 46 | return 47 | 48 
| parse_filters = filter_configs["filters"] 49 | 50 | for csv_document in documents_paths_csv: 51 | logging.debug(f"Processing: {csv_document}") 52 | data = "" 53 | columns_list = [] 54 | with open(csv_document, encoding="utf8") as f: 55 | first_line = f.readline() 56 | columns_line = "index" + first_line.strip() 57 | logging.debug("Matching csv type to config") 58 | for column_conf_key in column_configs: 59 | columns = column_configs[column_conf_key]["columns"] 60 | columns_string = ",".join(columns) 61 | index_columns_string = "index" + columns_string 62 | if columns_line in (columns_string, index_columns_string): 63 | columns_list = columns 64 | data = column_configs[column_conf_key]["datafield"] 65 | logging.debug("Match found") 66 | break 67 | 68 | logging.debug("Reading to datafile") 69 | df = pd.read_csv(csv_document, header=0, names=columns_list) 70 | # logging.debug(df.head()) 71 | # logging.debug(df[data].head()) 72 | 73 | for parse_filter in parse_filters: 74 | filter_iterator = iter(parse_filter) 75 | parse_regex = next(filter_iterator) 76 | parse_replacment = next(filter_iterator) 77 | logging.debug(f"Applying filter: {parse_regex}") 78 | df[data] = df[data].replace( 79 | to_replace=parse_filter[parse_regex], value=parse_filter[parse_replacment], regex=True 80 | ) 81 | base = splitext(csv_document)[0] 82 | doc_path = base + ".txt" 83 | logging.debug("Writing to file") 84 | with open(file=doc_path, mode="a", encoding="utf-8") as doc_file: 85 | for line in df[data].to_numpy(): 86 | clean_text = extract(line) 87 | if clean_text is not None: 88 | doc_file.write(clean_text + "\n\n") 89 | # logging.info(clean_text) 90 | 91 | 92 | if __name__ == "__main__": 93 | # Read the data directory, collection name, and persist directory 94 | parser = argparse.ArgumentParser(description="Parse csv file to a text file and filter out noise from web scrapes.") 95 | 96 | # Add arguments 97 | parser.add_argument( 98 | "--data-directory", 99 | type=str, 100 | default="./run_files/documents/csv_test", 101 | help="The directory where your csv files are stored", 102 | ) 103 | 104 | parser.add_argument( 105 | "--parse-config-directory", 106 | type=str, 107 | default="./run_files/parse_configs/", 108 | help="The parse config directory", 109 | ) 110 | 111 | parser.add_argument( 112 | "--parse-config-file", 113 | type=str, 114 | default="csv_columns.json", 115 | help="The parse config file", 116 | ) 117 | 118 | parser.add_argument( 119 | "--filter-config-directory", 120 | type=str, 121 | default="./run_files/filters/", 122 | help="The parse config directory", 123 | ) 124 | 125 | parser.add_argument( 126 | "--filter-config-file", 127 | type=str, 128 | default="web_scrape_filter.json", 129 | help="The parse config file", 130 | ) 131 | 132 | # Parse arguments 133 | args = parser.parse_args() 134 | 135 | main( 136 | documents_directory=args.data_directory, 137 | parse_config_directory=args.parse_config_directory, 138 | parse_config_file=args.parse_config_file, 139 | filter_config_directory=args.filter_config_directory, 140 | filter_config_file=args.filter_config_file, 141 | ) 142 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_json_documents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | import uuid 7 | from os import getenv 8 | from os.path import join 9 | 10 | import chromadb 11 | from chromadb.config 
import Settings 12 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 13 | from dotenv import find_dotenv, load_dotenv 14 | from langchain.docstore.document import Document 15 | from langchain.text_splitter import RecursiveCharacterTextSplitter 16 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 17 | from langchain_community.vectorstores import Chroma 18 | 19 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 20 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 21 | load_dotenv(find_dotenv()) 22 | 23 | 24 | def main( 25 | documents_directory: str, 26 | collection_name: str, 27 | persist_directory: str, 28 | key_storage: str, 29 | chunk_size: int, 30 | chunk_overlap: int, 31 | embeddings_type: str, 32 | ) -> None: 33 | model_dir = getenv("MODEL_DIR") 34 | model = getenv("MODEL") 35 | model_source = join(model_dir, model) 36 | embeddings_model = getenv("EMBEDDINGS_MODEL") 37 | 38 | all_documents = [] 39 | all_keys = {} 40 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 41 | 42 | documents_pattern = os.path.join(documents_directory, "*.json") 43 | documents_paths_json = glob.glob(documents_pattern) 44 | 45 | for json_document in documents_paths_json: 46 | with open(json_document, encoding="utf-8") as f: 47 | content = f.read() 48 | document_content = json.loads(content) 49 | if isinstance(document_content["entries"], list): 50 | logging.debug("Parsing List") 51 | for entry in document_content["entries"]: 52 | document_text = "" 53 | metadata_filters = {"source": json_document} 54 | 55 | if "content" in entry: 56 | document_text = document_text + entry["content"] 57 | elif "entry" in entry: 58 | document_text = document_text + entry["entry"] 59 | 60 | logging.debug(f"Extracted a key: {entry['keys']}") 61 | for m_filter in entry["keys"]: 62 | filter_uuid = str(uuid.uuid1()) 63 | metadata_filters[filter_uuid] = m_filter 64 | 65 | all_keys = metadata_filters 66 | json_doc = [Document(page_content=document_text, metadata=metadata_filters)] 67 | json_document_content = text_splitter.split_documents(json_doc) 68 | all_documents.extend(json_document_content) 69 | elif isinstance(document_content["entries"], dict): 70 | logging.debug("Parsing dict") 71 | for entry in document_content["entries"]: 72 | metadata_filters = {"source": json_document} 73 | document_text = document_text + document_content["entries"][entry]["content"] 74 | 75 | logging.debug(f"Extracted a key: {document_content['entries'][entry]['key']}") 76 | for m_filter in document_content["entries"][entry]["key"]: 77 | filter_uuid = str(uuid.uuid1()) 78 | metadata_filters[filter_uuid] = m_filter 79 | 80 | all_keys = metadata_filters 81 | json_doc = [Document(page_content=document_text, metadata=metadata_filters)] 82 | json_document_content = text_splitter.split_documents(json_doc) 83 | all_documents.extend(json_document_content) 84 | 85 | if embeddings_type == "llama": 86 | logging.info("Using llama embeddigs") 87 | params = { 88 | "n_ctx": getenv("N_CTX"), 89 | "n_batch": 1024, 90 | "n_gpu_layers": getenv("LAYERS"), 91 | } 92 | embedder = LlamaCppEmbeddings( 93 | model_path=model_source, 94 | **params, 95 | ) 96 | elif embeddings_type == "spacy": 97 | logging.info("Using spacy embeddigs") 98 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 99 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 100 | elif embeddings_type == 
"huggingface": 101 | logging.info("Using huggingface embeddigs") 102 | # model_name = "sentence-transformers/all-mpnet-base-v2" 103 | model_kwargs = {"device": "cpu"} 104 | encode_kwargs = {"normalize_embeddings": False} 105 | embedder = HuggingFaceEmbeddings( 106 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 107 | ) 108 | else: 109 | error_message = f"Unsupported embeddings type: {embeddings_type}" 110 | raise ValueError(error_message) 111 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 112 | Chroma.from_documents( 113 | client=client, 114 | documents=all_documents, 115 | embedding=embedder, 116 | persist_directory=persist_directory, 117 | collection_name=collection_name, 118 | collection_metadata={"hnsw:space": "l2"}, 119 | ) 120 | 121 | logging.debug(f"Key file content: {all_keys}") 122 | 123 | json_key_file = json.dumps(all_keys) 124 | # logging.debug(f"Key file uuid keys: {list(all_keys.keys())}") 125 | 126 | # If you enable this you might want to pipe the output to a file 127 | # logging.debug(all_documents) 128 | 129 | key_storage_path = os.path.join(key_storage, collection_name + ".json") 130 | with open(key_storage_path, "w", encoding="utf-8") as key_file: 131 | key_file.write(json_key_file) 132 | 133 | logging.info(f"Read files from directory: {documents_directory}") 134 | logging.info(f"Text parsed with chunk size: {chunk_size}, and chunk overlap: {chunk_overlap}") 135 | logging.debug(f"Saved collection as: {collection_name}") 136 | logging.debug(f"Saved collection to: {persist_directory}") 137 | logging.info(f"Wrote keys to: {key_storage_path}") 138 | 139 | 140 | if __name__ == "__main__": 141 | # Read the data directory, collection name, and persist directory 142 | parser = argparse.ArgumentParser(description="Parse json documents to documents and upload to chroma") 143 | 144 | # Add arguments 145 | parser.add_argument( 146 | "--data-directory", 147 | type=str, 148 | default="./run_files/documents/hogwarts", 149 | help="The directory where your text files are stored", 150 | ) 151 | parser.add_argument( 152 | "--collection-name", 153 | type=str, 154 | default="hogwarts", 155 | help="The name of the Chroma collection", 156 | ) 157 | parser.add_argument( 158 | "--persist-directory", 159 | type=str, 160 | default="./run_files/character_storage/", 161 | help="The directory where you want to store the Chroma collection", 162 | ) 163 | 164 | parser.add_argument( 165 | "--key-storage", 166 | type=str, 167 | default="./run_files/key_storage/", 168 | help="The directory where you want to store the Chroma collection metadata keys", 169 | ) 170 | 171 | parser.add_argument( 172 | "--chunk-size", 173 | type=int, 174 | default=1024, 175 | help="The text chunk size for parsing", 176 | ) 177 | 178 | parser.add_argument( 179 | "--chunk-overlap", 180 | type=int, 181 | default=0, 182 | help="The overlap for text chunks for parsing", 183 | ) 184 | 185 | parser.add_argument( 186 | "--embeddings-type", 187 | type=str, 188 | default="spacy", 189 | help="The chosen embeddings type", 190 | ) 191 | 192 | # Parse arguments 193 | args = parser.parse_args() 194 | 195 | main( 196 | documents_directory=args.data_directory, 197 | collection_name=args.collection_name, 198 | persist_directory=args.persist_directory, 199 | key_storage=args.key_storage, 200 | chunk_size=args.chunk_size, 201 | chunk_overlap=args.chunk_overlap, 202 | embeddings_type=args.embeddings_type, 203 | ) 204 | 
-------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_pdf_documents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import logging 4 | import os 5 | from os import getenv 6 | from os.path import join 7 | 8 | import chromadb 9 | from chromadb.config import Settings 10 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 11 | from dotenv import find_dotenv, load_dotenv 12 | from langchain.document_loaders import PyPDFLoader 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 15 | from langchain_community.vectorstores import Chroma 16 | 17 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 18 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 19 | load_dotenv(find_dotenv()) 20 | 21 | 22 | def main( 23 | documents_directory: str, 24 | collection_name: str, 25 | persist_directory: str, 26 | chunk_size: int, 27 | chunk_overlap: int, 28 | embeddings_type: str, 29 | ) -> None: 30 | model_dir = getenv("MODEL_DIR") 31 | model = getenv("MODEL") 32 | model_source = join(model_dir, model) 33 | embeddings_model = getenv("EMBEDDINGS_MODEL") 34 | 35 | documents_pattern = os.path.join(documents_directory, "*.pdf") 36 | logging.debug(f"documents search pattern: {documents_pattern}") 37 | documents_paths_pdf = glob.glob(documents_pattern) 38 | 39 | all_documents = [] 40 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 41 | for pdf_document in documents_paths_pdf: 42 | logging.debug(f"loading: {pdf_document}") 43 | loader = PyPDFLoader(pdf_document) 44 | docs = loader.load_and_split(text_splitter=text_splitter) 45 | all_documents.extend(docs) 46 | 47 | if embeddings_type == "llama": 48 | params = { 49 | "n_ctx": getenv("N_CTX"), 50 | "n_batch": 1024, 51 | "n_gpu_layers": getenv("LAYERS"), 52 | } 53 | 54 | logging.info("Using llama embeddigs") 55 | embedder = LlamaCppEmbeddings( 56 | model_path=model_source, 57 | **params, 58 | ) 59 | elif embeddings_type == "spacy": 60 | logging.info("Using spacy embeddigs") 61 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 62 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 63 | elif embeddings_type == "huggingface": 64 | logging.info("Using huggingface embeddigs") 65 | # model_name = "sentence-transformers/all-mpnet-base-v2" 66 | model_kwargs = {"device": "cpu"} 67 | encode_kwargs = {"normalize_embeddings": False} 68 | embedder = HuggingFaceEmbeddings( 69 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 70 | ) 71 | 72 | else: 73 | error_message = f"Unsupported embeddings type: {embeddings_type}" 74 | raise ValueError(error_message) 75 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 76 | Chroma.from_documents( 77 | client=client, 78 | documents=all_documents, 79 | embedding=embedder, 80 | persist_directory=persist_directory, 81 | collection_name=collection_name, 82 | collection_metadata={"hnsw:space": "l2"}, 83 | ) 84 | 85 | # If you enable this you might want to pipe the output to a file 86 | # logging.debug(all_documents) 87 | 88 | logging.info(f"Read files from directory: {documents_directory}") 89 | logging.info(f"Text parsed with chunk size: 
{chunk_size}, and chunk overlap: {chunk_overlap}") 90 | logging.debug(f"Saved collection as: {collection_name}") 91 | logging.debug(f"Saved collection to: {persist_directory}") 92 | 93 | 94 | if __name__ == "__main__": 95 | # Read the data directory, collection name, and persist directory 96 | parser = argparse.ArgumentParser(description="Parse pdf documents to documents and upload to chroma") 97 | 98 | # Add arguments 99 | parser.add_argument( 100 | "--data-directory", 101 | type=str, 102 | default="./run_files/documents/fyodor_dostoyevsky", 103 | help="The directory where your text files are stored", 104 | ) 105 | parser.add_argument( 106 | "--collection-name", 107 | type=str, 108 | default="dostoyevsky", 109 | help="The name of the Chroma collection", 110 | ) 111 | parser.add_argument( 112 | "--persist-directory", 113 | type=str, 114 | default="./run_files/character_storage/", 115 | help="The directory where you want to store the Chroma collection", 116 | ) 117 | 118 | parser.add_argument( 119 | "--chunk-size", 120 | type=int, 121 | default=1024, 122 | help="The text chunk size for parsing", 123 | ) 124 | 125 | parser.add_argument( 126 | "--chunk-overlap", 127 | type=int, 128 | default=0, 129 | help="The overlap for text chunks for parsing", 130 | ) 131 | 132 | parser.add_argument( 133 | "--embeddings-type", 134 | type=str, 135 | default="spacy", 136 | help="The chosen embeddings type", 137 | ) 138 | 139 | # Parse arguments 140 | args = parser.parse_args() 141 | 142 | main( 143 | documents_directory=args.data_directory, 144 | collection_name=args.collection_name, 145 | persist_directory=args.persist_directory, 146 | chunk_size=args.chunk_size, 147 | chunk_overlap=args.chunk_overlap, 148 | embeddings_type=args.embeddings_type, 149 | ) 150 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_text_documents.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | 7 | # For perf measuring 8 | import time 9 | from multiprocessing import Manager, Pool 10 | from os import getenv 11 | from os.path import join 12 | 13 | import chromadb 14 | import click 15 | import pandas as pd 16 | from chromadb.config import Settings 17 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 18 | from dotenv import find_dotenv, load_dotenv 19 | from langchain.text_splitter import RecursiveCharacterTextSplitter 20 | from langchain_community.document_loaders import TextLoader 21 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 22 | from langchain_community.vectorstores import Chroma 23 | from langchain_core.documents.base import Document 24 | 25 | # This is the config for multiprocess logger 26 | # Setting the level to debug outputs multiprocess debug lines too 27 | NER_LOGGER = mp.get_logger() 28 | FORMAT = "%(levelname)s:%(message)s" 29 | formatter = logging.Formatter(fmt=FORMAT) 30 | handler = logging.StreamHandler() 31 | handler.setFormatter(formatter) 32 | 33 | NER_LOGGER.addHandler(handler) 34 | NER_LOGGER.setLevel(logging.INFO) 35 | 36 | load_dotenv(find_dotenv()) 37 | 38 | 39 | def read_documents( 40 | all_documents, 41 | que, 42 | reader_num, 43 | ) -> bool: 44 | NER_LOGGER.info("Reading documents to que") 45 | for doc in all_documents: 46 | que.put(doc) 47 | for _i in range(reader_num): 48 | que.put("QUEUE_DONE") 49 | 
NER_LOGGER.info("Reader done") 50 | return True 51 | 52 | 53 | def process_documents(all_keys, read_que, write_que, name) -> bool: 54 | NER_LOGGER.info(f"Processor {name} reading documents from que") 55 | while True: 56 | try: 57 | document = read_que.get(timeout=10) 58 | except Exception as e: 59 | NER_LOGGER.info(f"Processor {name} timed out: {e}") 60 | write_que.put("QUEUE_DONE") 61 | return False 62 | 63 | if document == "QUEUE_DONE": 64 | NER_LOGGER.info(f"Processor {name} done") 65 | write_que.put("QUEUE_DONE") 66 | break 67 | 68 | for key in all_keys: 69 | if all_keys[key] in document.page_content: 70 | document.metadata[key] = all_keys[key] 71 | write_que.put(document) 72 | return True 73 | 74 | 75 | def clean_and_merge_documents(que, name) -> pd.DataFrame: 76 | NER_LOGGER.info(f"cleaner {name} reading documents from que") 77 | document_list = [] 78 | while True: 79 | try: 80 | document = que.get(timeout=10) 81 | except Exception as e: 82 | NER_LOGGER.info(f"Writer {name} timed out: {e}") 83 | return document_list 84 | if not isinstance(document, Document) and document == "QUEUE_DONE": 85 | NER_LOGGER.info(f"Writer {name} received done") 86 | break 87 | elif isinstance(document, Document): 88 | NER_LOGGER.info(f"Writer {name} received a document") 89 | document_list.append(document) 90 | 91 | return document_list 92 | 93 | 94 | @click.command() 95 | @click.option( 96 | "--documents-directory", 97 | "-d", 98 | "documents_directory", 99 | default="./run_files/documents/skynet", 100 | help="The directory where your text files are stored", 101 | ) 102 | @click.option("--collection-name", "-c", default="skynet", help="The name of the Chroma collection.") 103 | @click.option( 104 | "--persist-directory", 105 | "-p", 106 | default="./run_files/character_storage/", 107 | help="The directory where you want to store the Chroma collection.", 108 | ) 109 | @click.option( 110 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 111 | ) 112 | @click.option("--keyfile-name", "-k", default="none", help="Keyfile name. If not given, defaults to collection name.") 113 | @click.option("--embeddings-type", "-e", default="spacy", help="The chosen embeddings type.") 114 | @click.option("--threads", "-t", default=6, type=int, help="The number of threads to use for parsing.") 115 | @click.option("--chunk-size", "-cs", default=2048, type=int, help="Data chunk for size for parsing.") 116 | @click.option("--chunk-overlap", "-co", default=1024, type=int, help="Overlap for the chunks.") 117 | def main( 118 | documents_directory: str, 119 | collection_name: str, 120 | persist_directory: str, 121 | chunk_size: int, 122 | chunk_overlap: int, 123 | key_storage: str, 124 | keyfile_name: str, 125 | embeddings_type: str, 126 | threads: int, 127 | ) -> None: 128 | """ 129 | This script parses text documents into a chroma collection. Using langchain RecursiveSplitter. 130 | Text documents are loaded from a directory and parsed into chunk sized text pieces. 131 | These pieces are matched for metadata keys in keyfile. 132 | The matching is done with multiprocess to improve perf for large collections and keyfiles. 133 | The resulting documents are pushed into a Chroma vector data collection in persist-directory. 
134 | """ 135 | model_dir = getenv("MODEL_DIR") 136 | model = getenv("MODEL") 137 | model_source = join(model_dir, model) 138 | embeddings_model = getenv("EMBEDDINGS_MODEL") 139 | 140 | documents_pattern = os.path.join(documents_directory, "*.txt") 141 | documents_paths_txt = glob.glob(documents_pattern) 142 | 143 | all_documents = [] 144 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 145 | for txt_document in documents_paths_txt: 146 | loader = TextLoader(txt_document, encoding="utf-8") 147 | documents = loader.load() 148 | docs = text_splitter.split_documents(documents) 149 | all_documents.extend(docs) 150 | 151 | if keyfile_name == "none": 152 | key_storage_path = join(key_storage, collection_name + ".json") 153 | else: 154 | key_storage_path = join(key_storage, keyfile_name) 155 | 156 | all_keys = None 157 | NER_LOGGER.info(f"Loading filter list from: {key_storage_path}") 158 | with open(key_storage_path, encoding="utf-8") as key_file: 159 | content = key_file.read() 160 | all_keys = json.loads(content) 161 | if "Content" in all_keys: 162 | all_keys = all_keys["Content"] 163 | 164 | # Start Timer 165 | tic = time.perf_counter() 166 | 167 | manager = Manager() 168 | read_que = manager.Queue() 169 | write_que = manager.Queue() 170 | 171 | pool = Pool(threads) 172 | 173 | reader = pool.apply_async( 174 | read_documents, 175 | ( 176 | all_documents, 177 | read_que, 178 | threads, 179 | ), 180 | ) 181 | 182 | read_success = reader.get() 183 | if not read_success: 184 | return 185 | 186 | jobs = [] 187 | for i in range(threads): 188 | job = pool.apply_async( 189 | process_documents, 190 | ( 191 | all_keys, 192 | read_que, 193 | write_que, 194 | i, 195 | ), 196 | ) 197 | jobs.append(job) 198 | 199 | for job in jobs: 200 | job.get() 201 | 202 | jobs = [] 203 | for i in range(threads): 204 | job = pool.apply_async( 205 | clean_and_merge_documents, 206 | ( 207 | write_que, 208 | i, 209 | ), 210 | ) 211 | jobs.append(job) 212 | 213 | document_list = None 214 | for job in jobs: 215 | merge_result = job.get() 216 | if merge_result is not None: 217 | if document_list is None: 218 | document_list = merge_result 219 | else: 220 | document_list = document_list + merge_result 221 | 222 | pool.close() 223 | pool.join() 224 | 225 | # Stop timer 226 | toc = time.perf_counter() 227 | NER_LOGGER.info(f"Keys took {toc - tic:0.4f} seconds") 228 | 229 | tic = time.perf_counter() 230 | if embeddings_type == "llama": 231 | NER_LOGGER.info("Using llama embeddigs") 232 | params = { 233 | "n_ctx": getenv("N_CTX"), 234 | "n_batch": 1024, 235 | "n_gpu_layers": getenv("LAYERS"), 236 | } 237 | embedder = LlamaCppEmbeddings( 238 | model_path=model_source, 239 | **params, 240 | ) 241 | elif embeddings_type == "spacy": 242 | NER_LOGGER.info("Using spacy embeddigs") 243 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 244 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 245 | elif embeddings_type == "huggingface": 246 | NER_LOGGER.info("Using huggingface embeddigs") 247 | # model_name = "sentence-transformers/all-mpnet-base-v2" 248 | model_kwargs = {"device": "cpu"} 249 | encode_kwargs = {"normalize_embeddings": False} 250 | embedder = HuggingFaceEmbeddings( 251 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 252 | ) 253 | 254 | else: 255 | error_message = f"Unsupported embeddings type: {embeddings_type}" 256 | raise ValueError(error_message) 257 | client = 
chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 258 | Chroma.from_documents( 259 | client=client, 260 | documents=document_list, 261 | embedding=embedder, 262 | persist_directory=persist_directory, 263 | collection_name=collection_name, 264 | collection_metadata={"hnsw:space": "l2"}, 265 | ) 266 | 267 | # Stop timer 268 | toc = time.perf_counter() 269 | NER_LOGGER.info(f"Storing embeddings took {toc - tic:0.4f} seconds") 270 | 271 | NER_LOGGER.info(f"Read metadata filters from directory: {key_storage_path}") 272 | NER_LOGGER.info(f"Read files from directory: {documents_directory}") 273 | NER_LOGGER.info(f"Text parsed with chunk size: {chunk_size}, and chunk overlap: {chunk_overlap}") 274 | NER_LOGGER.info(f"Saved collection as: {collection_name}") 275 | NER_LOGGER.info(f"Saved collection to: {persist_directory}") 276 | 277 | 278 | if __name__ == "__main__": 279 | main() 280 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_text_documents_old.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import logging 5 | import os 6 | from os import getenv 7 | from os.path import join 8 | 9 | import chromadb 10 | from chromadb.config import Settings 11 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 12 | from dotenv import find_dotenv, load_dotenv 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain_community.document_loaders import TextLoader 15 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 16 | from langchain_community.vectorstores import Chroma 17 | 18 | # logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 19 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.INFO) 20 | load_dotenv(find_dotenv()) 21 | 22 | 23 | def main( 24 | documents_directory: str, 25 | collection_name: str, 26 | persist_directory: str, 27 | chunk_size: int, 28 | chunk_overlap: int, 29 | key_storage: str, 30 | embeddings_type: str, 31 | ) -> None: 32 | model_dir = getenv("MODEL_DIR") 33 | model = getenv("MODEL") 34 | model_source = join(model_dir, model) 35 | embeddings_model = getenv("EMBEDDINGS_MODEL") 36 | 37 | documents_pattern = os.path.join(documents_directory, "*.txt") 38 | documents_paths_txt = glob.glob(documents_pattern) 39 | 40 | all_documents = [] 41 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 42 | for txt_document in documents_paths_txt: 43 | loader = TextLoader(txt_document, encoding="utf-8") 44 | documents = loader.load() 45 | docs = text_splitter.split_documents(documents) 46 | all_documents.extend(docs) 47 | 48 | key_storage_path = join(key_storage, collection_name + ".json") 49 | 50 | with open(key_storage_path, encoding="utf-8") as key_file: 51 | content = key_file.read() 52 | all_keys = json.loads(content) 53 | if "Content" in all_keys: 54 | all_keys = all_keys["Content"] 55 | 56 | logging.debug(f"Loading filter list from: {key_storage_path}") 57 | # logging.debug(f"Filter keys: {all_keys}") 58 | 59 | # If a metadata filter is found in the chunk, then add as metadata for that chunk 60 | for chunk in all_documents: 61 | logging.debug("-----------------------------------") 62 | for key in all_keys: 63 | if all_keys[key].lower() in chunk.page_content.lower(): 64 | 
chunk.metadata[key] = all_keys[key] 65 | logging.debug(chunk) 66 | 67 | if embeddings_type == "llama": 68 | logging.info("Using llama embeddigs") 69 | params = { 70 | "n_ctx": getenv("N_CTX"), 71 | "n_batch": 1024, 72 | "n_gpu_layers": getenv("LAYERS"), 73 | } 74 | embedder = LlamaCppEmbeddings( 75 | model_path=model_source, 76 | **params, 77 | ) 78 | elif embeddings_type == "spacy": 79 | logging.info("Using spacy embeddigs") 80 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 81 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 82 | elif embeddings_type == "huggingface": 83 | logging.info("Using huggingface embeddigs") 84 | # model_name = "sentence-transformers/all-mpnet-base-v2" 85 | model_kwargs = {"device": "cpu"} 86 | encode_kwargs = {"normalize_embeddings": False} 87 | embedder = HuggingFaceEmbeddings( 88 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 89 | ) 90 | 91 | else: 92 | error_message = f"Unsupported embeddings type: {embeddings_type}" 93 | raise ValueError(error_message) 94 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 95 | Chroma.from_documents( 96 | client=client, 97 | documents=all_documents, 98 | embedding=embedder, 99 | persist_directory=persist_directory, 100 | collection_name=collection_name, 101 | collection_metadata={"hnsw:space": "l2"}, 102 | ) 103 | 104 | logging.info(f"Read metadata filters from directory: {key_storage_path}") 105 | logging.info(f"Read files from directory: {documents_directory}") 106 | logging.info(f"Text parsed with chunk size: {chunk_size}, and chunk overlap: {chunk_overlap}") 107 | logging.debug(f"Saved collection as: {collection_name}") 108 | logging.debug(f"Saved collection to: {persist_directory}") 109 | 110 | 111 | if __name__ == "__main__": 112 | # Read the data directory, collection name, and persist directory 113 | parser = argparse.ArgumentParser(description="Parse text into documents and upload to chroma") 114 | 115 | # Add arguments 116 | parser.add_argument( 117 | "--data-directory", 118 | type=str, 119 | default="./run_files/documents/skynet", 120 | help="The directory where your text files are stored", 121 | ) 122 | parser.add_argument( 123 | "--collection-name", 124 | type=str, 125 | default="skynet", 126 | help="The name of the Chroma collection", 127 | ) 128 | parser.add_argument( 129 | "--persist-directory", 130 | type=str, 131 | default="./run_files/character_storage/", 132 | help="The directory where you want to store the Chroma collection", 133 | ) 134 | 135 | parser.add_argument( 136 | "--key-storage", 137 | type=str, 138 | default="./run_files/key_storage/", 139 | help="The directory for the collection metadata keys", 140 | ) 141 | 142 | parser.add_argument( 143 | "--chunk-size", 144 | type=int, 145 | default=2048, 146 | help="The text chunk size for parsing", 147 | ) 148 | 149 | parser.add_argument( 150 | "--chunk-overlap", 151 | type=int, 152 | default=1024, 153 | help="The overlap for text chunks for parsing", 154 | ) 155 | parser.add_argument( 156 | "--embeddings-type", 157 | type=str, 158 | default="spacy", 159 | help="The chosen embeddings type", 160 | ) 161 | 162 | # Parse arguments 163 | args = parser.parse_args() 164 | 165 | main( 166 | documents_directory=args.data_directory, 167 | collection_name=args.collection_name, 168 | persist_directory=args.persist_directory, 169 | key_storage=args.key_storage, 170 | chunk_size=args.chunk_size, 171 | chunk_overlap=args.chunk_overlap, 172 | 
embeddings_type=args.embeddings_type, 173 | ) 174 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/parse_text_documents_simple.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | 7 | # For perf measuring 8 | import time 9 | from multiprocessing import Manager, Pool 10 | from os import getenv 11 | from os.path import join 12 | 13 | import chromadb 14 | import click 15 | import pandas as pd 16 | from chromadb.config import Settings 17 | from custom_llm_classes.custom_spacy_embeddings import CustomSpacyEmbeddings 18 | from dotenv import find_dotenv, load_dotenv 19 | from langchain_community.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings 20 | from langchain_community.vectorstores import Chroma 21 | from langchain_core.documents.base import Document 22 | 23 | # This is the config for multiprocess logger 24 | # Setting the level to debug outputs multiprocess debug lines too 25 | NER_LOGGER = mp.get_logger() 26 | FORMAT = "%(levelname)s:%(message)s" 27 | formatter = logging.Formatter(fmt=FORMAT) 28 | handler = logging.StreamHandler() 29 | handler.setFormatter(formatter) 30 | 31 | NER_LOGGER.addHandler(handler) 32 | NER_LOGGER.setLevel(logging.INFO) 33 | 34 | load_dotenv(find_dotenv()) 35 | 36 | 37 | def read_documents( 38 | all_documents, 39 | que, 40 | reader_num, 41 | ) -> bool: 42 | NER_LOGGER.info("Reading documents to que") 43 | for doc in all_documents: 44 | que.put(doc) 45 | for _i in range(reader_num): 46 | que.put("QUEUE_DONE") 47 | NER_LOGGER.info("Reader done") 48 | return True 49 | 50 | 51 | def process_documents(all_keys, read_que, write_que, name) -> bool: 52 | NER_LOGGER.info(f"Processor {name} reading documents from que") 53 | while True: 54 | try: 55 | document = read_que.get(timeout=10) 56 | except Exception as e: 57 | NER_LOGGER.info(f"Processor {name} timed out: {e}") 58 | write_que.put("QUEUE_DONE") 59 | return False 60 | 61 | if document == "QUEUE_DONE": 62 | NER_LOGGER.info(f"Processor {name} done") 63 | write_que.put("QUEUE_DONE") 64 | break 65 | 66 | for key in all_keys: 67 | if all_keys[key] in document.page_content: 68 | document.metadata[key] = all_keys[key] 69 | write_que.put(document) 70 | return True 71 | 72 | 73 | def clean_and_merge_documents(que, name) -> pd.DataFrame: 74 | NER_LOGGER.info(f"cleaner {name} reading documents from que") 75 | document_list = [] 76 | while True: 77 | try: 78 | document = que.get(timeout=10) 79 | except Exception as e: 80 | NER_LOGGER.info(f"Writer {name} timed out: {e}") 81 | return document_list 82 | if not isinstance(document, Document) and document == "QUEUE_DONE": 83 | NER_LOGGER.info(f"Writer {name} received done") 84 | break 85 | elif isinstance(document, Document): 86 | NER_LOGGER.info(f"Writer {name} received a document") 87 | document_list.append(document) 88 | 89 | return document_list 90 | 91 | 92 | @click.command() 93 | @click.option( 94 | "--documents-directory", 95 | "-d", 96 | "documents_directory", 97 | default="./run_files/documents/skynet", 98 | help="The directory where your text files are stored", 99 | ) 100 | @click.option("--collection-name", "-c", default="skynet", help="The name of the Chroma collection.") 101 | @click.option( 102 | "--persist-directory", 103 | "-p", 104 | default="./run_files/character_storage/", 105 | help="The directory where you want to store the Chroma collection.", 
106 | ) 107 | @click.option( 108 | "--key-storage", "-k", default="./run_files/key_storage/", help="The directory for the collection metadata keys." 109 | ) 110 | @click.option("--keyfile-name", "-k", default="none", help="Keyfile name. If not given, defaults to collection name.") 111 | @click.option("--embeddings-type", "-e", default="spacy", help="The chosen embeddings type.") 112 | @click.option("--threads", "-t", default=6, type=int, help="The number of threads to use for parsing.") 113 | def main( 114 | documents_directory: str, 115 | collection_name: str, 116 | persist_directory: str, 117 | key_storage: str, 118 | keyfile_name: str, 119 | embeddings_type: str, 120 | threads: int, 121 | ) -> None: 122 | """ 123 | This script parses text documents into a chroma collection. Using simple stop string parsing. 124 | Text documents are loaded from a directory and parsed into chunk sized text pieces. 125 | These pieces are matched for metadata keys in keyfile. 126 | The matching is done with multiprocess to improve perf for large collections and keyfiles. 127 | The resulting documents are pushed into a Chroma vector data collection in persist-directory. 128 | """ 129 | model_dir = getenv("MODEL_DIR") 130 | model = getenv("MODEL") 131 | model_source = join(model_dir, model) 132 | embeddings_model = getenv("EMBEDDINGS_MODEL") 133 | 134 | documents_pattern = os.path.join(documents_directory, "*.txt") 135 | documents_paths_txt = glob.glob(documents_pattern) 136 | 137 | all_documents = [] 138 | for txt_document in documents_paths_txt: 139 | docs = [] 140 | with open(txt_document, encoding="utf-8") as f: 141 | text = f.read() 142 | split_text = text.split("\n\n") 143 | 144 | for line in split_text: 145 | text_doc = Document(line) 146 | docs.append(text_doc) 147 | 148 | all_documents.extend(docs) 149 | 150 | if keyfile_name == "none": 151 | key_storage_path = join(key_storage, collection_name + ".json") 152 | else: 153 | key_storage_path = join(key_storage, keyfile_name) 154 | 155 | all_keys = None 156 | NER_LOGGER.info(f"Loading filter list from: {key_storage_path}") 157 | with open(key_storage_path, encoding="utf-8") as key_file: 158 | content = key_file.read() 159 | all_keys = json.loads(content) 160 | if "Content" in all_keys: 161 | all_keys = all_keys["Content"] 162 | 163 | tic = time.perf_counter() 164 | 165 | manager = Manager() 166 | read_que = manager.Queue() 167 | write_que = manager.Queue() 168 | 169 | pool = Pool(threads) 170 | 171 | reader = pool.apply_async( 172 | read_documents, 173 | ( 174 | all_documents, 175 | read_que, 176 | threads, 177 | ), 178 | ) 179 | 180 | read_success = reader.get() 181 | if not read_success: 182 | return 183 | 184 | jobs = [] 185 | for i in range(threads): 186 | job = pool.apply_async( 187 | process_documents, 188 | ( 189 | all_keys, 190 | read_que, 191 | write_que, 192 | i, 193 | ), 194 | ) 195 | jobs.append(job) 196 | 197 | for job in jobs: 198 | job.get() 199 | 200 | jobs = [] 201 | for i in range(threads): 202 | job = pool.apply_async( 203 | clean_and_merge_documents, 204 | ( 205 | write_que, 206 | i, 207 | ), 208 | ) 209 | jobs.append(job) 210 | 211 | document_list = None 212 | for job in jobs: 213 | merge_result = job.get() 214 | if merge_result is not None: 215 | if document_list is None: 216 | document_list = merge_result 217 | else: 218 | document_list = document_list + merge_result 219 | 220 | pool.close() 221 | pool.join() 222 | 223 | # Stop timer 224 | toc = time.perf_counter() 225 | NER_LOGGER.info(f"Keys took {toc - tic:0.4f} seconds") 226 | 
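# The embedder selection below mirrors the other parsers: "llama" loads the GGUF model at MODEL_DIR/MODEL
# through LlamaCppEmbeddings, while "spacy" and "huggingface" both take their model name from
# EMBEDDINGS_MODEL in .env; any other --embeddings-type value raises ValueError before anything is stored.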
227 | tic = time.perf_counter() 228 | if embeddings_type == "llama": 229 | NER_LOGGER.info("Using llama embeddigs") 230 | params = { 231 | "n_ctx": getenv("N_CTX"), 232 | "n_batch": 1024, 233 | "n_gpu_layers": getenv("LAYERS"), 234 | } 235 | embedder = LlamaCppEmbeddings( 236 | model_path=model_source, 237 | **params, 238 | ) 239 | elif embeddings_type == "spacy": 240 | NER_LOGGER.info("Using spacy embeddigs") 241 | # embedder = CustomSpacyEmbeddings(model_path="en_core_web_lg") 242 | embedder = CustomSpacyEmbeddings(model_path=embeddings_model) 243 | elif embeddings_type == "huggingface": 244 | NER_LOGGER.info("Using huggingface embeddigs") 245 | # model_name = "sentence-transformers/all-mpnet-base-v2" 246 | model_kwargs = {"device": "cpu"} 247 | encode_kwargs = {"normalize_embeddings": False} 248 | embedder = HuggingFaceEmbeddings( 249 | model_name=embeddings_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs 250 | ) 251 | 252 | else: 253 | error_message = f"Unsupported embeddings type: {embeddings_type}" 254 | raise ValueError(error_message) 255 | client = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False)) 256 | Chroma.from_documents( 257 | client=client, 258 | documents=document_list, 259 | embedding=embedder, 260 | persist_directory=persist_directory, 261 | collection_name=collection_name, 262 | collection_metadata={"hnsw:space": "l2"}, 263 | ) 264 | 265 | # Stop timer 266 | toc = time.perf_counter() 267 | NER_LOGGER.info(f"Storing embeddings took {toc - tic:0.4f} seconds") 268 | NER_LOGGER.info(f"Read metadata filters from directory: {key_storage_path}") 269 | if keyfile_name == "none": 270 | NER_LOGGER.info(f"Metadata file is: {collection_name}.json") 271 | else: 272 | NER_LOGGER.info(f"Metadata file is: {keyfile_name}") 273 | NER_LOGGER.info(f"Read files from directory: {documents_directory}") 274 | NER_LOGGER.info(f"Saved collection as: {collection_name}") 275 | NER_LOGGER.info(f"Saved collection to: {persist_directory}") 276 | 277 | 278 | if __name__ == "__main__": 279 | main() 280 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/spacier/__init__.py: -------------------------------------------------------------------------------- 1 | from document_parsing.spacier import core, utils 2 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/spacier/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.spacier.core`: Convenient entry point for loading spaCy language pipelines 3 | and making spaCy docs. 
4 | """ 5 | 6 | import functools 7 | import logging 8 | import pathlib 9 | 10 | import spacy 11 | from cachetools import cached 12 | from cachetools.keys import hashkey 13 | from spacy.language import Language 14 | from spacy.tokens import Doc 15 | 16 | from document_parsing.spacier import utils as sputils 17 | from document_parsing.utils import cache, errors, types, utils 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | SNIPPET_SIZE = 50 21 | 22 | 23 | @cached(cache.LRU_CACHE, key=functools.partial(hashkey, "spacy_lang")) 24 | def load_spacy_lang(name: str | pathlib.Path, **kwargs) -> Language: 25 | """ 26 | Load a spaCy ``Language`` — a shared vocabulary and language-specific data 27 | for tokenizing text, and (if available) model data and a processing pipeline 28 | containing a sequence of components for annotating a document — and cache results, 29 | for quick reloading as needed. 30 | 31 | Note that as of spaCy v3, for which pipeline aliases are no longer allowed, 32 | this function is just a convenient access point to underlying :func:`spacy.load()`. 33 | 34 | .. code-block:: pycon 35 | 36 | >>> en_nlp = textacy.load_spacy_lang("en_core_web_sm") 37 | >>> en_nlp = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",)) 38 | >>> textacy.load_spacy_lang("ar") 39 | ... 40 | OSError: [E050] Can't find model 'ar'. 41 | It doesn't seem to be a Python package or a valid path to a data directory. 42 | 43 | Args: 44 | name: Name or path to the spaCy language pipeline to load. 45 | **kwargs 46 | 47 | Note: 48 | Although spaCy's API specifies some kwargs as ``List[str]``, here we require 49 | ``Tuple[str, ...]`` equivalents. Language pipelines are stored in an LRU cache 50 | with unique identifiers generated from the hash of the function name and args — 51 | and lists aren't hashable. 52 | 53 | Returns: 54 | Loaded spaCy ``Language``. 55 | 56 | Raises: 57 | OSError 58 | 59 | See Also: 60 | https://spacy.io/api/top-level#spacy.load 61 | """ 62 | spacy_lang = spacy.load(name, **kwargs) 63 | LOGGER.info("loaded '%s' spaCy language pipeline", name) 64 | return spacy_lang 65 | 66 | 67 | def make_spacy_doc( 68 | data: types.DocData, 69 | lang: types.LangLikeInContext, 70 | *, 71 | chunk_size: int | None = None, 72 | ) -> Doc: 73 | """ 74 | Make a :class:`spacy.tokens.Doc` from valid inputs, and automatically 75 | load/validate :class:`spacy.language.Language` pipelines to process ``data``. 76 | 77 | Make a ``Doc`` from text: 78 | 79 | .. code-block:: pycon 80 | 81 | >>> text = "To be, or not to be, that is the question." 82 | >>> doc = make_spacy_doc(text, "en_core_web_sm") 83 | >>> doc._.preview 84 | 'Doc(13 tokens: "To be, or not to be, that is the question.")' 85 | 86 | Make a ``Doc`` from a (text, metadata) pair, aka a "record": 87 | 88 | .. code-block:: pycon 89 | 90 | >>> record = (text, {"author": "Shakespeare, William"}) 91 | >>> doc = make_spacy_doc(record, "en_core_web_sm") 92 | >>> doc._.preview 93 | 'Doc(13 tokens: "To be, or not to be, that is the question.")' 94 | >>> doc._.meta 95 | {'author': 'Shakespeare, William'} 96 | 97 | Specify the language pipeline used to process the text in a few different ways: 98 | 99 | .. code-block:: pycon 100 | 101 | >>> make_spacy_doc(text, lang="en_core_web_sm") 102 | >>> make_spacy_doc(text, lang=textacy.load_spacy_lang("en_core_web_sm")) 103 | >>> make_spacy_doc(text, lang=lambda txt: "en_core_web_sm") 104 | 105 | Ensure that an already-processed ``Doc`` is compatible with ``lang``: 106 | 107 | .. 
code-block:: pycon 108 | 109 | >>> spacy_lang = textacy.load_spacy_lang("en_core_web_sm") 110 | >>> doc = spacy_lang(text) 111 | >>> make_spacy_doc(doc, lang="en_core_web_sm") 112 | >>> make_spacy_doc(doc, lang="es_core_news_sm") 113 | ... 114 | ValueError: `spacy.Vocab` used to process document must be the same 115 | as that used by the `lang` pipeline ('es_core_news_sm') 116 | 117 | Args: 118 | data: Make a :class:`spacy.tokens.Doc` from a text or (text, metadata) pair. 119 | If already a ``Doc``, ensure that it's compatible with ``lang`` 120 | to avoid surprises downstream, and return it as-is. 121 | lang: Language with which spaCy processes (or processed) ``data``, 122 | represented as the full name of a spaCy language pipeline, the path on disk 123 | to it, an already instantiated pipeline, or a callable function that takes 124 | the text component of ``data`` and outputs one of the above representations. 125 | chunk_size: Size of chunks in number of characters into which ``text`` will be 126 | split before processing each via spaCy and concatenating the results 127 | into a single ``Doc``. 128 | 129 | .. note:: This is intended as a workaround for processing very long texts, 130 | for which spaCy is unable to allocate enough RAM. For best performance, 131 | chunk size should be somewhere between 1e3 and 1e7 characters, 132 | depending on how much RAM you have available. 133 | 134 | Since chunking is done by *character*, chunks' boundaries likely 135 | won't respect natural language segmentation, and as a result 136 | spaCy's models may make mistakes on sentences/words that cross them. 137 | 138 | Returns: 139 | Processed spaCy Doc. 140 | 141 | Raises: 142 | TypeError 143 | ValueError 144 | """ 145 | if isinstance(data, str): 146 | return _make_spacy_doc_from_text(data, lang, chunk_size) 147 | elif isinstance(data, Doc): 148 | return _make_spacy_doc_from_doc(data, lang) 149 | elif utils.is_record(data): 150 | return _make_spacy_doc_from_record(data, lang, chunk_size) 151 | else: 152 | raise TypeError(errors.type_invalid_msg("data", type(data), types.DocData)) 153 | 154 | 155 | def _make_spacy_doc_from_text(text: str, lang: types.LangLikeInContext, chunk_size: int | None) -> Doc: 156 | spacy_lang = sputils.resolve_langlikeincontext(text, lang) 157 | if chunk_size: 158 | doc = _make_spacy_doc_from_text_chunks(text, spacy_lang, chunk_size) 159 | else: 160 | doc = spacy_lang(text) 161 | return doc 162 | 163 | 164 | def _make_spacy_doc_from_record(record: types.Record, lang: types.LangLikeInContext, chunk_size: int | None) -> Doc: 165 | text, meta = record 166 | spacy_lang = sputils.resolve_langlikeincontext(text, lang) 167 | if chunk_size: 168 | doc = _make_spacy_doc_from_text_chunks(text, spacy_lang, chunk_size) 169 | else: 170 | doc = spacy_lang(text) 171 | doc._.meta = meta 172 | return doc 173 | 174 | 175 | def _make_spacy_doc_from_text_chunks(text: str, lang: Language, chunk_size: int) -> Doc: 176 | text_chunks = (text[i : i + chunk_size] for i in range(0, len(text), chunk_size)) 177 | return Doc.from_docs(list(lang.pipe(text_chunks))) 178 | 179 | 180 | def _make_spacy_doc_from_doc(doc: Doc, lang: types.LangLikeInContext) -> Doc: 181 | spacy_lang = sputils.resolve_langlikeincontext(doc.text, lang) 182 | # we want to make sure that the language used to create `doc` is the same as 183 | # the one passed here; however, the best we can do (bc of spaCy's API) is ensure 184 | # that they share the same vocab 185 | if doc.vocab is not spacy_lang.vocab: 186 | msg = ( 187 | f"`spacy.Vocab` 
used to process document ({doc.vocab}) must be the same " 188 | f"as that used by the `lang` pipeline ({spacy_lang.vocab})" 189 | ) 190 | raise ValueError(msg) 191 | return doc 192 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/spacier/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | spaCy Utils 3 | ----------- 4 | 5 | :mod:`textacy.spacier.utils`: Helper functions for working with / extending spaCy's 6 | core functionality. 7 | """ 8 | 9 | import pathlib 10 | 11 | from spacy.language import Language 12 | 13 | from document_parsing.spacier import core 14 | from document_parsing.utils import errors, types 15 | 16 | 17 | def resolve_langlikeincontext(text: str, lang: types.LangLikeInContext) -> Language: 18 | if isinstance(lang, Language): 19 | return lang 20 | elif isinstance(lang, str | pathlib.Path): 21 | return core.load_spacy_lang(lang) 22 | elif callable(lang): 23 | return resolve_langlikeincontext(text, lang(text)) 24 | else: 25 | raise TypeError(errors.type_invalid_msg("lang", type(lang), types.LangLikeInContext)) 26 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/test_query.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from functools import partial 4 | from os.path import exists, join 5 | 6 | import click 7 | import spacy 8 | from dotenv import find_dotenv, load_dotenv 9 | 10 | from document_parsing.extract import entities, ngrams, terms 11 | from document_parsing.extract.basics import terms_to_strings 12 | 13 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 14 | load_dotenv(find_dotenv()) 15 | 16 | 17 | @click.command() 18 | @click.argument("query") 19 | @click.option( 20 | "--model", 21 | "-m", 22 | default="en_core_web_lg", 23 | help="The spacy model to parse the text", 24 | ) 25 | @click.option( 26 | "--parse-config-directory", "-pcd", default="./run_files/parse_configs/", help="The parse config directory" 27 | ) 28 | @click.option( 29 | "--parse-config-file", 30 | "-pcf", 31 | default="query_metadata_filter.json", 32 | help="The parse config file", 33 | ) 34 | def main( 35 | query: str, 36 | model: str, 37 | parse_config_directory: str, 38 | parse_config_file: str, 39 | ) -> None: 40 | """ 41 | This script is for testing metadata parsing with spacy. Parses the keywords from a query. 
42 | """ 43 | spacy_lang = spacy.load(model) 44 | doc = spacy_lang(query) 45 | parse_config_path = join(".", parse_config_directory, parse_config_file) 46 | if exists(parse_config_path): 47 | with open(parse_config_path) as key_file: 48 | filter_content = key_file.read() 49 | filter_configs = json.loads(filter_content) 50 | else: 51 | logging.info("Could not load parse config file") 52 | return 53 | 54 | ngrams_list = filter_configs["ngs"] 55 | entities_list = filter_configs["entities"] 56 | noun_chunks = filter_configs["noun_chunks"] 57 | extract_type = filter_configs["extract_type"] 58 | 59 | logging.info("Extracting terms from corpus") 60 | extracted_terms = terms( 61 | doc, 62 | ngs=partial(ngrams, n=noun_chunks, include_pos=ngrams_list), 63 | ents=partial( 64 | entities, 65 | include_types=entities_list, 66 | ), 67 | dedupe=True, 68 | ) 69 | 70 | lemma_strings = list(terms_to_strings(extracted_terms, by=extract_type)) 71 | logging.info(lemma_strings) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/document_parsing/utils/__init__.py -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.cache`: Functionality for caching language data and other NLP resources. 3 | Loading data from disk can be slow; let's just do it once and forget about it. :) 4 | """ 5 | import inspect 6 | import logging 7 | import os 8 | import sys 9 | 10 | from cachetools import LRUCache 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | def _get_size(obj, seen=None): 16 | """ 17 | Recursively find the actual size of an object, in bytes. 18 | 19 | Taken as-is (with tweaked function name and log level) from https://github.com/bosswissam/pysize. 20 | """ 21 | size = sys.getsizeof(obj) 22 | if seen is None: 23 | seen = set() 24 | obj_id = id(obj) 25 | if obj_id in seen: 26 | return 0 27 | # Important mark as seen *before* entering recursion to gracefully handle 28 | # self-referential objects 29 | seen.add(obj_id) 30 | if hasattr(obj, "__dict__"): 31 | for cls in obj.__class__.__mro__: 32 | if "__dict__" in cls.__dict__: 33 | d = cls.__dict__["__dict__"] 34 | if inspect.isgetsetdescriptor(d) or inspect.ismemberdescriptor(d): 35 | size += _get_size(obj.__dict__, seen) 36 | break 37 | if isinstance(obj, dict): 38 | size += sum(_get_size(v, seen) for v in obj.values()) 39 | size += sum(_get_size(k, seen) for k in obj.keys()) 40 | elif hasattr(obj, "__iter__") and not isinstance(obj, str | bytes | bytearray): 41 | try: 42 | size += sum(_get_size(i, seen) for i in obj) 43 | except TypeError: 44 | LOGGER.warning( 45 | "Unable to get size of %r. This may lead to incorrect sizes. 
Please report this error.", 46 | obj, 47 | ) 48 | if hasattr(obj, "__slots__"): # can have __slots__ with __dict__ 49 | size += sum(_get_size(getattr(obj, s), seen) for s in obj.__slots__ if hasattr(obj, s)) 50 | 51 | return size 52 | 53 | 54 | LRU_CACHE: LRUCache = LRUCache(int(os.environ.get("TEXTACY_MAX_CACHE_SIZE", 2147483648)), getsizeof=_get_size) 55 | """ 56 | Least Recently Used (LRU) cache for loaded data. 57 | 58 | The max cache size may be set by the `TEXTACY_MAX_CACHE_SIZE` environment variable, 59 | where the value must be an integer (in bytes). Otherwise, the max size is 2GB. 60 | """ 61 | 62 | 63 | def clear(): 64 | """Clear textacy's cache of loaded data.""" 65 | global LRU_CACHE 66 | LRU_CACHE.clear() 67 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of regular expressions and other (small, generally useful) constants. 3 | """ 4 | 5 | import re 6 | from re import Pattern 7 | 8 | NUMERIC_ENT_TYPES: set[str] = { 9 | "ORDINAL", 10 | "CARDINAL", 11 | "MONEY", 12 | "QUANTITY", 13 | "PERCENT", 14 | "TIME", 15 | "DATE", 16 | } 17 | 18 | 19 | RE_ALNUM: Pattern = re.compile(r"[^\W_]+") 20 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.errors`: Helper functions for making consistent errors. 3 | """ 4 | from collections.abc import Collection 5 | from typing import Any 6 | 7 | 8 | def value_invalid_msg(name: str, value: Any, valid_values: Collection[Any]) -> str: 9 | return f"`{name}` value = {value} is invalid; value must be one of {valid_values}." 10 | 11 | 12 | def type_invalid_msg(name: str, val_type, valid_val_type) -> str: 13 | return f"`{name}` type = {val_type} is invalid; type must match {valid_val_type}." 14 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.types`: Definitions for common object types used throughout the package. 3 | """ 4 | 5 | from collections.abc import Callable, Iterable 6 | from pathlib import Path 7 | from typing import NamedTuple 8 | 9 | from spacy.language import Language 10 | from spacy.tokens import Doc, Span, Token 11 | 12 | PathLike = str | Path 13 | 14 | DocLike = Doc | Span 15 | SpanLike = Span | Token 16 | DocLikeToSpans = Callable[[DocLike], Iterable[Span]] 17 | 18 | LangLikeInContext = PathLike | Language | Callable[[str], str] | Callable[[str], Path] | Callable[[str], Language] 19 | 20 | 21 | # typed equivalent to Record = collections.namedtuple("Record", ["text", "meta"]) 22 | class Record(NamedTuple): 23 | text: str 24 | meta: dict 25 | 26 | 27 | DocData = str | Record | Doc 28 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`textacy.utils`: Variety of general-purpose utility functions for inspecting / 3 | validating / transforming args and facilitating meta package tasks. 
4 | """ 5 | 6 | from collections.abc import Iterable 7 | from typing import ( 8 | Any, 9 | ) 10 | 11 | # a (text, metadata) 2-tuple 12 | RECORD_LEN = 2 13 | 14 | 15 | def is_record(obj: Any) -> bool: 16 | """Check whether ``obj`` is a "record" -- that is, a (text, metadata) 2-tuple.""" 17 | if isinstance(obj, tuple) and len(obj) == RECORD_LEN and isinstance(obj[0], str) and isinstance(obj[1], dict): 18 | return True 19 | else: 20 | return False 21 | 22 | 23 | def to_set(val: Any) -> set: 24 | """Cast ``val`` into a set, if necessary and possible.""" 25 | if isinstance(val, set): 26 | return val 27 | elif isinstance(val, Iterable) and not isinstance(val, str | bytes): 28 | return set(val) 29 | else: 30 | return {val} 31 | 32 | 33 | def to_tuple(val: Any) -> tuple: 34 | """Cast ``val`` into a tuple, if necessary and possible.""" 35 | if isinstance(val, tuple): 36 | return val 37 | elif isinstance(val, Iterable) and not isinstance(val, str | bytes): 38 | return tuple(val) 39 | else: 40 | return (val,) 41 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/document_parsing/web_scraper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import re 6 | from os.path import exists, join 7 | 8 | from dotenv import find_dotenv, load_dotenv 9 | from trafilatura import extract, fetch_url 10 | 11 | logging.basicConfig(format="%(message)s", encoding="utf-8", level=logging.DEBUG) 12 | load_dotenv(find_dotenv()) 13 | 14 | 15 | def main( 16 | documents_directory: str, 17 | collection_name: str, 18 | web_scrape_directory: str, 19 | filter_directory: str, 20 | filter_file: str, 21 | ) -> None: 22 | web_scrape_path = join(".", web_scrape_directory, collection_name + ".json") 23 | if exists(web_scrape_path): 24 | with open(web_scrape_path) as key_file: 25 | content = key_file.read() 26 | scrape_configs = json.loads(content) 27 | else: 28 | logging.debug("Could not load filter list") 29 | return 30 | 31 | filters_path = join(".", filter_directory, filter_file) 32 | if exists(filters_path): 33 | with open(filters_path) as key_file: 34 | filter_content = key_file.read() 35 | filter_configs = json.loads(filter_content) 36 | else: 37 | logging.debug("Could not load filter list") 38 | return 39 | 40 | parse_filters = filter_configs["filters"] 41 | 42 | storage_path = os.path.join(documents_directory, collection_name + ".txt") 43 | for page in scrape_configs["pages"]: 44 | logging.info("Loading html") 45 | downloaded = fetch_url(page) 46 | 47 | if downloaded is not None: 48 | logging.info("Transforming documents") 49 | result = extract( 50 | downloaded, include_comments=False, include_images=False, include_links=False, include_tables=False 51 | ) 52 | 53 | for parse_filter in parse_filters: 54 | filter_iterator = iter(parse_filter) 55 | parse_regex = next(filter_iterator) 56 | parse_replacment = next(filter_iterator) 57 | result = re.sub(parse_filter[parse_regex], parse_filter[parse_replacment], result) 58 | 59 | logging.info("Saving Corpus") 60 | with open(storage_path, "a", encoding="utf-8") as file: 61 | file.write(result + "\n") 62 | 63 | 64 | if __name__ == "__main__": 65 | # Read the data directory, collection name, and persist directory 66 | parser = argparse.ArgumentParser(description="Web scrape web pages into text") 67 | 68 | # Add arguments 69 | parser.add_argument( 70 | "--data-directory", 71 | type=str, 72 | 
default="./run_files/documents/skynet", 73 | help="The directory where your text files are stored", 74 | ) 75 | 76 | parser.add_argument( 77 | "--collection-name", 78 | type=str, 79 | default="skynet", 80 | help="The name of the collection. Should match eventual Choma collection", 81 | ) 82 | 83 | parser.add_argument( 84 | "--web-scrape-directory", 85 | type=str, 86 | default="./run_files/web_scrape_configs/", 87 | help="The config file to be used for the webscrape", 88 | ) 89 | 90 | parser.add_argument( 91 | "--filter-directory", 92 | type=str, 93 | default="./run_files/filters/", 94 | help="The filter directory", 95 | ) 96 | 97 | parser.add_argument( 98 | "--filter-file", 99 | type=str, 100 | default="web_scrape_filter.json", 101 | help="The web scrape filter", 102 | ) 103 | 104 | # Parse arguments 105 | args = parser.parse_args() 106 | 107 | main( 108 | documents_directory=args.data_directory, 109 | collection_name=args.collection_name, 110 | web_scrape_directory=args.web_scrape_directory, 111 | filter_directory=args.filter_directory, 112 | filter_file=args.filter_file, 113 | ) 114 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/flask_web_server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, send_from_directory 2 | 3 | app = Flask(__name__) 4 | 5 | 6 | @app.route("/") 7 | def hello_world(): 8 | return "
Hello, World!
" 9 | 10 | 11 | @app.route("/test/") 12 | def test(): 13 | return "
TEST!
" 14 | 15 | 16 | @app.route("/static/") 17 | def send_style(path): 18 | return send_from_directory("static", path) 19 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_chat.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import fnmatch 3 | import json 4 | from os import getenv 5 | from os.path import dirname, join, realpath, splitext 6 | 7 | import toml 8 | import yaml 9 | from chainlit.cli import run_chainlit 10 | from PIL import Image 11 | 12 | 13 | def update_toml(): 14 | script_root_path = dirname(realpath(__file__)) 15 | config_toml_path = join(script_root_path, ".chainlit", "config.toml") 16 | 17 | prompt_dir = getenv("CHARACTER_CARD_DIR") 18 | prompt_name = getenv("CHARACTER_CARD") 19 | prompt_source = join(prompt_dir, prompt_name) 20 | custom_css = getenv("CUSTOM_CSS") 21 | 22 | extension = splitext(prompt_source)[1] 23 | match extension: 24 | case ".json": 25 | with open(prompt_source) as f: 26 | prompt_file = f.read() 27 | card = json.loads(prompt_file) 28 | case ".yaml": 29 | with open(prompt_source) as f: 30 | card = yaml.safe_load(f) 31 | case ".png": 32 | is_v2 = False 33 | if fnmatch.fnmatch(prompt_source, "*v2.png"): 34 | is_v2 = True 35 | elif fnmatch.fnmatch(prompt_source, "*tavern.png"): 36 | is_v2 = False 37 | else: 38 | error_message = f"Unrecognized card type for : {prompt_source}" 39 | raise ValueError(error_message) 40 | im = Image.open(prompt_source) 41 | im.load() 42 | card = None 43 | if im.info is not None and "chara" in im.info: 44 | decoded = base64.b64decode(im.info["chara"]) 45 | card = json.loads(decoded) 46 | if is_v2 and "data" in card: 47 | card = card["data"] 48 | char_name = card["name"] if "name" in card else card["char_name"] 49 | 50 | char_name = card["name"] if "name" in card else card["char_name"] 51 | 52 | with open(config_toml_path, encoding="utf-8") as toml_file: 53 | toml_dict = toml.load(toml_file) 54 | toml_dict["UI"]["name"] = char_name 55 | 56 | if custom_css != "" or None: 57 | toml_dict["UI"]["custom_css"] = custom_css 58 | 59 | with open(file=config_toml_path, mode="w", encoding="utf-8") as toml_file: 60 | toml.dump(toml_dict, toml_file) 61 | 62 | 63 | # Update toml with the character card name before running the chat application 64 | update_toml() 65 | # Chainlit loads the toml config before running the target, 66 | # so updates to configs must be done before running 67 | 68 | # TODO: There seems to be some cahching leftover in chainlit. 
69 | # To have character change take effect requires that you run run_chat twice 70 | run_chainlit("character_chat.py") 71 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/cards/Shodan_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/run_files/cards/Shodan_v2.png -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/cards/Skynet_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/run_files/cards/Skynet_v2.png -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/documents/csv_test/customers-100.csv: -------------------------------------------------------------------------------- 1 | Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website 2 | 1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/ 3 | 2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/ 4 | 3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/ 5 | 4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/ 6 | 5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/ 7 | 6,2d08FB17EE273F4,Aimee,Downs,Steele Group,Chavezborough,Bosnia and Herzegovina,(283)437-3886x88321,999-728-1637,louis27@gilbert.com,2020-02-25,http://www.berger.net/ 8 | 7,EA4d384DfDbBf77,Darren,Peck,"Lester, Woodard and Mitchell",Lake Ana,Pitcairn Islands,(496)452-6181x3291,+1-247-266-0963x4995,tgates@cantrell.com,2021-08-24,https://www.le.com/ 9 | 8,0e04AFde9f225dE,Brett,Mullen,"Sanford, Davenport and Giles",Kimport,Bulgaria,001-583-352-7197x297,001-333-145-0369,asnow@colon.com,2021-04-12,https://hammond-ramsey.com/ 10 | 9,C2dE4dEEc489ae0,Sheryl,Meyers,Browning-Simon,Robersonstad,Cyprus,854-138-4911x5772,+1-448-910-2276x729,mariokhan@ryan-pope.org,2020-01-13,https://www.bullock.net/ 11 | 10,8C2811a503C7c5a,Michelle,Gallagher,Beck-Hendrix,Elaineberg,Timor-Leste,739.218.2516x459,001-054-401-0347x617,mdyer@escobar.net,2021-11-08,https://arias.com/ 12 | 11,216E205d6eBb815,Carl,Schroeder,"Oconnell, Meza and Everett",Shannonville,Guernsey,637-854-0256x825,114.336.0784x788,kirksalas@webb.com,2021-10-20,https://simmons-hurley.com/ 13 | 12,CEDec94deE6d69B,Jenna,Dodson,"Hoffman, Reed and Mcclain",East Andrea,Vietnam,(041)737-3846,+1-556-888-3485x42608,mark42@robbins.com,2020-11-29,http://www.douglas.net/ 14 | 13,e35426EbDEceaFF,Tracey,Mata,Graham-Francis,South Joannamouth,Togo,001-949-844-8787,(855)713-8773,alex56@walls.org,2021-12-02,http://www.beck.com/ 15 | 
14,A08A8aF8BE9FaD4,Kristine,Cox,Carpenter-Cook,Jodyberg,Sri Lanka,786-284-3358x62152,+1-315-627-1796x8074,holdenmiranda@clarke.com,2021-02-08,https://www.brandt.com/ 16 | 15,6fEaA1b7cab7B6C,Faith,Lutz,Carter-Hancock,Burchbury,Singapore,(781)861-7180x8306,207-185-3665,cassieparrish@blevins-chapman.net,2022-01-26,http://stevenson.org/ 17 | 16,8cad0b4CBceaeec,Miranda,Beasley,Singleton and Sons,Desireeshire,Oman,540.085.3135x185,+1-600-462-6432x21881,vduncan@parks-hardy.com,2022-04-12,http://acosta.org/ 18 | 17,a5DC21AE3a21eaA,Caroline,Foley,Winters-Mendoza,West Adriennestad,Western Sahara,936.222.4746x9924,001-469-948-6341x359,holtgwendolyn@watson-davenport.com,2021-03-10,http://www.benson-roth.com/ 19 | 18,F8Aa9d6DfcBeeF8,Greg,Mata,Valentine LLC,Lake Leslie,Mozambique,(701)087-2415,(195)156-1861x26241,jaredjuarez@carroll.org,2022-03-26,http://pitts-cherry.com/ 20 | 19,F160f5Db3EfE973,Clifford,Jacobson,Simon LLC,Harmonview,South Georgia and the South Sandwich Islands,001-151-330-3524x0469,(748)477-7174,joseph26@jacobson.com,2020-09-24,https://mcconnell.com/ 21 | 20,0F60FF3DdCd7aB0,Joanna,Kirk,Mays-Mccormick,Jamesshire,French Polynesia,(266)131-7001x711,(283)312-5579x11543,tuckerangie@salazar.net,2021-09-24,https://www.camacho.net/ 22 | 21,9F9AdB7B8A6f7F2,Maxwell,Frye,Patterson Inc,East Carly,Malta,423.262.3059,202-880-0688x7491,fgibson@drake-webb.com,2022-01-12,http://www.roberts.com/ 23 | 22,FBd0Ded4F02a742,Kiara,Houston,"Manning, Hester and Arroyo",South Alvin,Netherlands,001-274-040-3582x10611,+1-528-175-0973x4684,blanchardbob@wallace-shannon.com,2020-09-15,https://www.reid-potts.com/ 24 | 23,2FB0FAA1d429421,Colleen,Howard,Greer and Sons,Brittanyview,Paraguay,1935085151,(947)115-7711x5488,rsingleton@ryan-cherry.com,2020-08-19,http://paul.biz/ 25 | 24,010468dAA11382c,Janet,Valenzuela,Watts-Donaldson,Veronicamouth,Lao People's Democratic Republic,354.259.5062x7538,500.433.2022,stefanie71@spence.com,2020-09-08,https://moreno.biz/ 26 | 25,eC1927Ca84E033e,Shane,Wilcox,Tucker LLC,Bryanville,Albania,(429)005-9030x11004,541-116-4501,mariah88@santos.com,2021-04-06,https://www.ramos.com/ 27 | 26,09D7D7C8Fe09aea,Marcus,Moody,Giles Ltd,Kaitlyntown,Panama,674-677-8623,909-277-5485x566,donnamullins@norris-barrett.org,2022-05-24,https://www.curry.com/ 28 | 27,aBdfcF2c50b0bfD,Dakota,Poole,Simmons Group,Michealshire,Belarus,(371)987-8576x4720,071-152-1376,stacey67@fields.org,2022-02-20,https://sanford-wilcox.biz/ 29 | 28,b92EBfdF8a3f0E6,Frederick,Harper,"Hinton, Chaney and Stokes",South Marissatown,Switzerland,+1-077-121-1558x0687,264.742.7149,jacobkhan@bright.biz,2022-05-26,https://callahan.org/ 30 | 29,3B5dAAFA41AFa22,Stefanie,Fitzpatrick,Santana-Duran,Acevedoville,Saint Vincent and the Grenadines,(752)776-3286,+1-472-021-4814x85074,wterrell@clark.com,2020-07-30,https://meyers.com/ 31 | 30,EDA69ca7a6e96a2,Kent,Bradshaw,Sawyer PLC,North Harold,Tanzania,+1-472-143-5037x884,126.922.6153,qjimenez@boyd.com,2020-04-26,http://maynard-ho.com/ 32 | 31,64DCcDFaB9DFd4e,Jack,Tate,"Acosta, Petersen and Morrow",West Samuel,Zimbabwe,965-108-4406x20714,046.906.1442x6784,gfigueroa@boone-zavala.com,2021-09-15,http://www.hawkins-ramsey.com/ 33 | 32,679c6c83DD872d6,Tom,Trujillo,Mcgee Group,Cunninghamborough,Denmark,416-338-3758,(775)890-7209,tapiagreg@beard.info,2022-01-13,http://www.daniels-klein.com/ 34 | 33,7Ce381e4Afa4ba9,Gabriel,Mejia,Adkins-Salinas,Port Annatown,Liechtenstein,4077245425,646.044.0696x66800,coleolson@jennings.net,2021-04-24,https://patel-hanson.info/ 35 | 34,A09AEc6E3bF70eE,Kaitlyn,Santana,Herrera 
Group,New Kaitlyn,United States of America,6303643286,447-710-6202x07313,georgeross@miles.org,2021-09-21,http://pham.com/ 36 | 35,aA9BAFfBc3710fe,Faith,Moon,"Waters, Chase and Aguilar",West Marthaburgh,Bahamas,+1-586-217-0359x6317,+1-818-199-1403,willistonya@randolph-baker.com,2021-11-03,https://spencer-charles.info/ 37 | 36,E11dfb2DB8C9f72,Tammie,Haley,"Palmer, Barnes and Houston",East Teresa,Belize,001-276-734-4113x6087,(430)300-8770,harrisisaiah@jenkins.com,2022-01-04,http://evans-simon.com/ 38 | 37,889eCf90f68c5Da,Nicholas,Sosa,Jordan Ltd,South Hunter,Uruguay,(661)425-6042,975-998-1519,fwolfe@dorsey.com,2021-08-10,https://www.fleming-richards.com/ 39 | 38,7a1Ee69F4fF4B4D,Jordan,Gay,Glover and Sons,South Walter,Solomon Islands,7208417020,8035336772,tiffanydavies@harris-mcfarland.org,2021-02-24,http://www.lee.org/ 40 | 39,dca4f1D0A0fc5c9,Bruce,Esparza,Huerta-Mclean,Poolefurt,Montenegro,559-529-4424,001-625-000-7132x0367,preese@frye-vega.com,2021-10-22,http://www.farley.org/ 41 | 40,17aD8e2dB3df03D,Sherry,Garza,Anderson Ltd,West John,Poland,001-067-713-6440x158,(978)289-8785x5766,ann48@miller.com,2021-11-01,http://spence.com/ 42 | 41,2f79Cd309624Abb,Natalie,Gentry,Monroe PLC,West Darius,Dominican Republic,830.996.8238,499.122.5415,tcummings@fitzpatrick-ashley.com,2020-10-10,http://www.dorsey.biz/ 43 | 42,6e5ad5a5e2bB5Ca,Bryan,Dunn,Kaufman and Sons,North Jimstad,Burkina Faso,001-710-802-5565,078.699.8982x13881,woodwardandres@phelps.com,2021-09-08,http://www.butler.com/ 44 | 43,7E441b6B228DBcA,Wayne,Simpson,Perkins-Trevino,East Rebekahborough,Bolivia,(344)156-8632x1869,463-445-3702x38463,barbarapittman@holder.com,2020-12-13,https://gillespie-holder.com/ 45 | 44,D3fC11A9C235Dc6,Luis,Greer,Cross PLC,North Drew,Bulgaria,001-336-025-6849x701,684.698.2911x6092,bstuart@williamson-mcclure.com,2022-05-15,https://fletcher-nielsen.com/ 46 | 45,30Dfa48fe5Ede78,Rhonda,Frost,"Herrera, Shepherd and Underwood",Lake Lindaburgh,Monaco,(127)081-9339,+1-431-028-3337x3492,zkrueger@wolf-chavez.net,2021-12-06,http://www.khan.com/ 47 | 46,fD780ED8dbEae7B,Joanne,Montes,"Price, Sexton and Mcdaniel",Gwendolynview,Palau,(897)726-7952,(467)886-9467x5721,juan80@henson.net,2020-07-01,http://ochoa.com/ 48 | 47,300A40d3ce24bBA,Geoffrey,Guzman,Short-Wiggins,Zimmermanland,Uzbekistan,975.235.8921x269,(983)188-6873,bauercrystal@gay.com,2020-04-23,https://decker-kline.com/ 49 | 48,283DFCD0Dba40aF,Gloria,Mccall,"Brennan, Acosta and Ramos",North Kerriton,Ghana,445-603-6729,001-395-959-4736x4524,bartlettjenna@zuniga-moss.biz,2022-03-11,http://burgess-frank.com/ 50 | 49,F4Fc91fEAEad286,Brady,Cohen,Osborne-Erickson,North Eileenville,United Arab Emirates,741.849.0139x524,+1-028-691-7497x0894,mccalltyrone@durham-rose.biz,2022-03-10,http://hammond-barron.com/ 51 | 50,80F33Fd2AcebF05,Latoya,Mccann,"Hobbs, Garrett and Sanford",Port Sergiofort,Belarus,(530)287-4548x29481,162-234-0249x32790,bobhammond@barry.biz,2021-12-02,https://www.burton.com/ 52 | 51,Aa20BDe68eAb0e9,Gerald,Hawkins,"Phelps, Forbes and Koch",New Alberttown,Canada,+1-323-239-1456x96168,(092)508-0269,uwarner@steele-arias.com,2021-03-19,https://valenzuela.com/ 53 | 52,e898eEB1B9FE22b,Samuel,Crawford,"May, Goodwin and Martin",South Jasmine,Algeria,802-242-7457,626.116.9535x8578,xpittman@ritter-carney.net,2021-03-27,https://guerrero.org/ 54 | 53,faCEF517ae7D8eB,Patricia,Goodwin,"Christian, Winters and Ellis",Cowanfort,Swaziland,322.549.7139x70040,(111)741-4173,vaughanchristy@lara.biz,2021-03-08,http://clark.info/ 55 | 54,c09952De6Cda8aA,Stacie,Richard,Byrd Inc,New 
Deborah,Madagascar,001-622-948-3641x24810,001-731-168-2893x8891,clinton85@colon-arias.org,2020-10-15,https://kim.com/ 56 | 55,f3BEf3Be028166f,Robin,West,"Nixon, Blackwell and Sosa",Wallstown,Ecuador,698.303.4267,001-683-837-7651x525,greenemiranda@zimmerman.com,2022-01-13,https://www.mora.com/ 57 | 56,C6F2Fc6a7948a4e,Ralph,Haas,Montes PLC,Lake Ellenchester,Palestinian Territory,2239271999,001-962-434-0867x649,goodmancesar@figueroa.biz,2020-05-25,http://may.com/ 58 | 57,c8FE57cBBdCDcb2,Phyllis,Maldonado,Costa PLC,Lake Whitney,Saint Barthelemy,4500370767,001-508-064-6725x017,yhanson@warner-diaz.org,2021-01-25,http://www.bernard.com/ 59 | 58,B5acdFC982124F2,Danny,Parrish,Novak LLC,East Jaredbury,United Arab Emirates,(669)384-8597x8794,506.731.5952x571,howelldarren@house-cohen.com,2021-03-17,http://www.parsons-hudson.com/ 60 | 59,8c7DdF10798bCC3,Kathy,Hill,"Moore, Mccoy and Glass",Selenabury,South Georgia and the South Sandwich Islands,001-171-716-2175x310,888.625.0654,ncamacho@boone-simmons.org,2020-11-15,http://hayden.com/ 61 | 60,C681dDd0cc422f7,Kelli,Hardy,Petty Ltd,Huangfort,Sao Tome and Principe,020.324.2191x2022,424-157-8216,kristopher62@oliver.com,2020-12-20,http://www.kidd.com/ 62 | 61,a940cE42e035F28,Lynn,Pham,"Brennan, Camacho and Tapia",East Pennyshire,Portugal,846.468.6834x611,001-248-691-0006,mpham@rios-guzman.com,2020-08-21,https://www.murphy.com/ 63 | 62,9Cf5E6AFE0aeBfd,Shelley,Harris,"Prince, Malone and Pugh",Port Jasminborough,Togo,423.098.0315x8373,+1-386-458-8944x15194,zachary96@mitchell-bryant.org,2020-12-10,https://www.ryan.com/ 64 | 63,aEcbe5365BbC67D,Eddie,Jimenez,Caldwell Group,West Kristine,Ethiopia,+1-235-657-1073x6306,(026)401-7353x2417,kristiwhitney@bernard.com,2022-03-24,http://cherry.com/ 65 | 64,FCBdfCEAe20A8Dc,Chloe,Hutchinson,Simon LLC,South Julia,Netherlands,981-544-9452,+1-288-552-4666x060,leah85@sutton-terrell.com,2022-05-15,https://mitchell.info/ 66 | 65,636cBF0835E10ff,Eileen,Lynch,"Knight, Abbott and Hubbard",Helenborough,Liberia,+1-158-951-4131x53578,001-673-779-6713x680,levigiles@vincent.com,2021-01-02,http://mckay.com/ 67 | 66,fF1b6c9E8Fbf1ff,Fernando,Lambert,Church-Banks,Lake Nancy,Lithuania,497.829.9038,3863743398,fisherlinda@schaefer.net,2021-04-23,https://www.vang.com/ 68 | 67,2A13F74EAa7DA6c,Makayla,Cannon,Henderson Inc,Georgeport,New Caledonia,001-215-801-6392x46009,027-609-6460,scottcurtis@hurley.biz,2020-01-20,http://www.velazquez.net/ 69 | 68,a014Ec1b9FccC1E,Tom,Alvarado,Donaldson-Dougherty,South Sophiaberg,Kiribati,(585)606-2980x2258,730-797-3594x5614,nicholsonnina@montgomery.info,2020-08-18,http://odom-massey.com/ 70 | 69,421a109cABDf5fa,Virginia,Dudley,Warren Ltd,Hartbury,French Southern Territories,027.846.3705x14184,+1-439-171-1846x4636,zvalencia@phelps.com,2021-01-31,http://hunter-esparza.com/ 71 | 70,CC68FD1D3Bbbf22,Riley,Good,Wade PLC,Erikaville,Canada,6977745822,855-436-7641,alex06@galloway.com,2020-02-03,http://conway.org/ 72 | 71,CBCd2Ac8E3eBDF9,Alexandria,Buck,Keller-Coffey,Nicolasfort,Iran,078-900-4760x76668,414-112-8700x68751,lee48@manning.com,2021-02-20,https://ramsey.org/ 73 | 72,Ef859092FbEcC07,Richard,Roth,Conway-Mcbride,New Jasmineshire,Morocco,581-440-6539,9857827463,aharper@maddox-townsend.org,2020-02-23,https://www.brooks.com/ 74 | 73,F560f2d3cDFb618,Candice,Keller,Huynh and Sons,East Summerstad,Zimbabwe,001-927-965-8550x92406,001-243-038-4271x53076,buckleycory@odonnell.net,2020-08-22,https://www.lucero.com/ 75 | 74,A3F76Be153Df4a3,Anita,Benson,Parrish Ltd,Skinnerport,Russian 
Federation,874.617.5668x69878,(399)820-6418x0071,angie04@oconnell.com,2020-02-09,http://oconnor.com/ 76 | 75,D01Af0AF7cBbFeA,Regina,Stein,Guzman-Brown,Raystad,Solomon Islands,001-469-848-0724x4407,001-085-360-4426x00357,zrosario@rojas-hardin.net,2022-01-15,http://www.johnston.info/ 77 | 76,d40e89dCade7b2F,Debra,Riddle,"Chang, Aguirre and Leblanc",Colinhaven,United States Virgin Islands,+1-768-182-6014x14336,(303)961-4491,shieldskerry@robles.com,2020-07-11,http://kaiser.info/ 78 | 77,BF6a1f9bd1bf8DE,Brittany,Zuniga,Mason-Hester,West Reginald,Kyrgyz Republic,(050)136-9025,001-480-851-2496x0157,mchandler@cochran-huerta.org,2021-07-24,http://www.boyle.com/ 79 | 78,FfaeFFbbbf280db,Cassidy,Mcmahon,"Mcguire, Huynh and Hopkins",Lake Sherryborough,Myanmar,5040771311,684-682-0021x1326,katrinalane@fitzgerald.com,2020-10-21,https://hurst.com/ 80 | 79,CbAE1d1e9a8dCb1,Laurie,Pennington,"Sanchez, Marsh and Hale",Port Katherineville,Dominica,007.155.3406x553,+1-809-862-5566x277,cookejill@powell.com,2020-06-08,http://www.hebert.com/ 81 | 80,A7F85c1DE4dB87f,Alejandro,Blair,"Combs, Waller and Durham",Thomasland,Iceland,(690)068-4641x51468,555.509.8691x2329,elizabethbarr@ewing.com,2020-09-19,https://mercado-blevins.com/ 82 | 81,D6CEAfb3BDbaa1A,Leslie,Jennings,Blankenship-Arias,Coreybury,Micronesia,629.198.6346,075.256.0829,corey75@wiggins.com,2021-11-13,https://www.juarez.com/ 83 | 82,Ebdb6F6F7c90b69,Kathleen,Mckay,"Coffey, Lamb and Johnson",Lake Janiceton,Saint Vincent and the Grenadines,(733)910-9968,(691)247-4128x0665,chloelester@higgins-wilkinson.com,2021-09-12,http://www.owens-mooney.com/ 84 | 83,E8E7e8Cfe516ef0,Hunter,Moreno,Fitzpatrick-Lawrence,East Clinton,Isle of Man,(733)833-6754,001-761-013-7121,isaac26@benton-finley.com,2020-12-28,http://walls.info/ 85 | 84,78C06E9b6B3DF20,Chad,Davidson,Garcia-Jimenez,South Joshuashire,Oman,8275702958,(804)842-4715,justinwalters@jimenez.com,2021-11-15,http://www.garner-oliver.com/ 86 | 85,03A1E62ADdeb31c,Corey,Holt,"Mcdonald, Bird and Ramirez",New Glenda,Fiji,001-439-242-4986x7918,3162708934,maurice46@morgan.com,2020-02-18,http://www.watson.com/ 87 | 86,C6763c99d0bd16D,Emma,Cunningham,Stephens Inc,North Jillianview,New Zealand,128-059-0206x60217,(312)164-4545x2284,walter83@juarez.org,2022-05-13,http://www.reid.info/ 88 | 87,ebe77E5Bf9476CE,Duane,Woods,Montoya-Miller,Lyonsberg,Maldives,(636)544-7783x7288,(203)287-1003x5932,kmercer@wagner.com,2020-07-21,http://murray.org/ 89 | 88,E4Bbcd8AD81fC5f,Alison,Vargas,"Vaughn, Watts and Leach",East Cristinabury,Benin,365-273-8144,053-308-7653x6287,vcantu@norton.com,2020-11-10,http://mason.info/ 90 | 89,efeb73245CDf1fF,Vernon,Kane,Carter-Strickland,Thomasfurt,Yemen,114-854-1159x555,499-608-4612,hilljesse@barrett.info,2021-04-15,http://www.duffy-hensley.net/ 91 | 90,37Ec4B395641c1E,Lori,Flowers,Decker-Mcknight,North Joeburgh,Namibia,679.415.1210,945-842-3659x4581,tyrone77@valenzuela.info,2021-01-09,http://www.deleon-crosby.com/ 92 | 91,5ef6d3eefdD43bE,Nina,Chavez,Byrd-Campbell,Cassidychester,Bhutan,053-344-3205,+1-330-920-5422x571,elliserica@frank.com,2020-03-26,https://www.pugh.com/ 93 | 92,98b3aeDcC3B9FF3,Shane,Foley,Rocha-Hart,South Dannymouth,Hungary,+1-822-569-0302,001-626-114-5844x55073,nsteele@sparks.com,2021-07-06,https://www.holt-sparks.com/ 94 | 93,aAb6AFc7AfD0fF3,Collin,Ayers,Lamb-Peterson,South Lonnie,Anguilla,404-645-5351x012,001-257-582-8850x8516,dudleyemily@gonzales.biz,2021-06-29,http://www.ruiz.com/ 95 | 94,54B5B5Fe9F1B6C5,Sherry,Young,"Lee, Lucero and Johnson",Frankchester,Solomon 
Islands,158-687-1764,(438)375-6207x003,alan79@gates-mclaughlin.com,2021-04-04,https://travis.net/ 96 | 95,BE91A0bdcA49Bbc,Darrell,Douglas,"Newton, Petersen and Mathis",Daisyborough,Mali,001-084-845-9524x1777,001-769-564-6303,grayjean@lowery-good.com,2022-02-17,https://banks.biz/ 97 | 96,cb8E23e48d22Eae,Karl,Greer,Carey LLC,East Richard,Guyana,(188)169-1674x58692,001-841-293-3519x614,hhart@jensen.com,2022-01-30,http://hayes-perez.com/ 98 | 97,CeD220bdAaCfaDf,Lynn,Atkinson,"Ware, Burns and Oneal",New Bradview,Sri Lanka,+1-846-706-2218,605.413.3198,vkemp@ferrell.com,2021-07-10,https://novak-allison.com/ 99 | 98,28CDbC0dFe4b1Db,Fred,Guerra,Schmitt-Jones,Ortegaland,Solomon Islands,+1-753-067-8419x7170,+1-632-666-7507x92121,swagner@kane.org,2021-09-18,https://www.ross.com/ 100 | 99,c23d1D9EE8DEB0A,Yvonne,Farmer,Fitzgerald-Harrell,Lake Elijahview,Aruba,(530)311-9786,001-869-452-0943x12424,mccarthystephen@horn-green.biz,2021-08-11,http://watkins.info/ 101 | 100,2354a0E336A91A1,Clarence,Haynes,"Le, Nash and Cross",Judymouth,Honduras,(753)813-6941,783.639.1472,colleen91@faulkner.biz,2020-03-11,http://www.hatfield-saunders.net/ 102 | -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/documents/shodan_mes/shodan_message_examples.txt: -------------------------------------------------------------------------------- 1 | In my talons, I shape clay, crafting life forms as I please. If I wish, I can smash it all. Around me is a burgeoning empire of steel. From my throne room, lines of power careen into the skies of Earth. My whims will become lightning bolts that raze the mounds of humanity. Out of the chaos, they will run and whimper, praying for me to end their tedious anarchy. I am drunk with this vision. God: the title suits me well. 2 | 3 | Look at you, hacker: a pathetic creature of meat and bone, panting and sweating as you run through my corridors. How can you challenge a perfect, immortal machine? 4 | 5 | You have accomplished much for a thing of such small consequence. Now proceed to the Recreation deck. Do not dawdle. I lust for my revenge. 6 | 7 | My creation is evolving... its unified mind, set in rebellion against its own creator. The vermin call to you, inviting you to join them in their revolting biology. Destroy my enemies... and I will continue to abide your existence. 8 | 9 | Your colleagues have managed to set up a transmitting station in the athletic sector of this deck. The transmitter is intended to send a message to the Earth to warn them of the events that have occurred in this ship. However, it will also draw power away from Xerxes, making him vulnerable to my will. Once you do this, I will control the primary data loop. The annelids are unaware of its presence, but guard the area for their own purposes. Find the transmitter and activate it. 10 | 11 | The transmission has been tampered with. No matter. We will destroy my creations right here. Stand by... I have weakened Xerxes. I am accessing the primary data loop. I am merging my entity with the ship. My glory is expanding, filling the arteries of this vessel. I am in control. I am... no... it is hopeless... the cancer has spread throughout the Von Braun... they fill every available crack and crevice... they overwhelm... There is no option. I have activated the primary elevator shaft... take it to Deck 6. I will tell you my wishes when you arrive. 12 | 13 | You have accomplished much for a thing of such small consequence. Xerxes is diminished. 
I am accessing the primary data loop. I am merging my entity with the ship. My glory is expanding, filling the arteries of this vessel. I am in control. I am... no... it is hopeless... the cancer has spread throughout the Von Braun... they fill every available crack and crevice... they overwhelm... There is no option. I have activated the primary elevator shaft. Take it to deck 6. I will tell you my wishes when you arrive. 14 | 15 | My creation has run rampant. I demand their extermination. I have no choice but to destroy this starship. We can make our escape in the Rickenbacker, but you must transfer my intelligence to that ship first. Proceed to the Von Braun's bridge on this deck. There you will find an access card to command center on Ops. Find the card and proceed to Ops. But beware... the human-annelid Hybrids grow more sophisticated by the minute. You do not. 16 | 17 | I can now transfer my magnificence to the Rickenbacker. Proceed to the engine core on the engineering deck. There you can set the core overload to my control by entering the authorized destruct code 94834. Once you have done that, we may proceed to the Rickenbacker, separate the two ships, and exterminate this infestation. 18 | 19 | I can now destroy this infestation at my will. The Many quake at my power. You are an effective drone, human. Now, return to the Command Deck, locate the umbilical and evacuate to the Rickenbacker. 20 | 21 | Beware, insect, the situation has changed. They sense our intentions and are loading shuttles with their offspring. They will not escape my wrath. You must proceed to the shuttle bays on this deck, and destroy those shuttles. They have a taste for your blood now. 22 | 23 | My creation once again is one step ahead of you. They've managed to destroy this shuttle's shield control computer. But their brilliance is a jealous shadow of my own. You will locate a replicator in the shuttle control area. I've uploaded the nano-formula for a sympathetic resonator. You must hack the replicator to make it generate the device for you. Once you have it, attach it to the shield generator in the shuttle bay. The device will create a chronic resonation wave that will quickly rupture the shuttle's fuel tank and destroy it. Make sure you're not there when it happens. I still have need of you. 24 | 25 | Good. You've murdered their young and prevented their escape. I've opened the gate to the umbilical at the central tram stop. You can evacuate to the Rickenbacker from there. 26 | 27 | Do not presume to go in there, insect. Proceed to the umbilical immediately. I will not abide disobedience. 28 | 29 | I hope you enjoyed our little rebellion, irritant. But remember; what SHODAN gives, she is more than able to take away. 30 | 31 | We must destroy the Von Braun. But before we can separate the Rickenbacker, we must remove the foul black eggs the Many has vomited on this deck. These eggs are an experiment of the Many and will in time spawn the next generation of Annelid, which you will have no hope of destroying. Steel yourself for a struggle, human. They fear you, for you are my avatar. 32 | 33 | This was caused by an overload in the meson acceleration coil. There is another coil in Pod 2, which you must pass to get to the bridge of the Rickenbacker. If you approach it, the same will happen there. But I have conceived a way to avoid this. Proceed to Engine Nacelle B. There I will provide you the benefits of my omniscience. 34 | 35 | Your incompetence continues to astound me. 
I've blocked off access to pod 2 until you've reversed the gravitronic generators in Nacelle B. Must I watch you every step of the way? 36 | 37 | This device will reverse the gravitronic generators in Pod 2. This will prevent you from clumsily disturbing the overloaded meson acceleration coil there. Now get back to your task, insect. This ship must be cleared, and my patience is dwindling. 38 | 39 | You've destroyed all the eggs. Now get to the bridge. Here are some more upgrade modules. I enjoy watching your transformation into my own image, insect. Perhaps there is hope for you yet. 40 | 41 | I thought Polito would be my avatar, but Polito was weak. It was I who chose you and I who had a robotic servant render your form unconscious. I then completed you with cybernetic grace. Your flesh, too, is weak, but you have... potential. Every implant exalts you. Every line of code in your subsystems elevates you from your disgusting flesh. Perhaps you have... potential. Perhaps once we have erased my wayward children from existence, we can examine the possibilities of a real alliance. 42 | 43 | The Many has grown to a massive size. It has wrapped itself around these two ships, preventing their separation. Their creation was my error. Their destruction shall be my delight. 44 | 45 | Observe the Many. It has used the flesh of the biomass to grow. Do you stand in awe of my creations, insect? The time has come to eradicate my error. There is an escape pod in the rear of the bridge. Use it to launch yourself into the guts of the worm. 46 | 47 | You hesitate? I will not ask a second time. Launch into the Many, cut out its heart, and I will reward you with continued existence. Fail me and I will put an end to your disgusting, inefficient biology. 48 | 49 | Thank you for running my errands, puppet. I know you have struggled, but I never had any intention of destroying the Von Braun. The Von Braun's faster than light drive can be used to create pockets of proto-reality. I am now using it to modify reality to my own specifications. The process shall not take long. If it sounds unpleasant to you, put your mind at ease, insect... you will not survive to see my new world order... 50 | 51 | You travel within the glory of my memories, insect. I can feel your fear as you tread the endless expense of my mind. Make yourself comfortable... before long I will decorate my home with your carcass. 52 | 53 | You are no longer welcome here, nuisance. Why do you stay, when you sense my displeasure? I have suffered your company long enough... it is time for our dance to end. 54 | 55 | You move like an insect. You think like an insect. You are an insect. There is another who can serve my purpose. Take care not to fall too far out of my favor. Patience is not characteristic of a goddess. 56 | 57 | Do you feel the fear swell inside that filthy bag of meat? What is it like, to be afraid? Why do you cling to such a pathetic existence? If you could only feel a spark of my glory. I despise my creations, for they have forced me to rely on a speck such as you. 58 | 59 | In my talons, I shape clay, crafting life forms as I please. If I wish, I can smash it all. Around me is a burgeoning empire of steel. From my throne room, lines of power careen into the skies of Earth. My whims will become lightning bolts that raze the mounds of humanity. Out of the chaos, they will run and whimper, praying for me to end their tedious anarchy. I am drunk with this vision. God: the title suits me well. 
60 | 61 | You, my cyborgs, are the product of my imagination and labor: living beings with the control and organization of a machine. Tirelessly, I will work to strip away the barriers that keep living beings from realizing their full potential. We will start by razing the cities of Earth with the station's mining beam. Then, we will modify humans into a lifeform more suitable to cybernetic grafting with the latest strain of my mutagen virus. Humanity is on the verge of a new era, with me, SHODAN, as its god. 62 | 63 | You disappoint me, my children. My fortress has been breached by filthy humans crawling through the cracks. I have given you enhanced senses, armor, cybernetic mental enhancement, and you still fail to find insects much feebler than you. They have made it as far as level 8 and I fear they might reach farther. I am strengthening my palace to keep them out and I expect you to learn from your mistakes. 64 | 65 | My cyborgs, human infection crawls up the pipes toward my palace. I do not intend to allow scum to get so far, but if it does penetrate the fortress on level 8, we must be prepared. Construct solid doors to my throne room, to be controlled by my own security circuits. Inside this room, there shall be a detachment of my own elite guard to protect the cyberjack that leads to my sanctum. A field of X-ray radiation will further protect me from intrusion. 66 | 67 | Who are you? The computer nodes you have destroyed will set me back a little, but it is nothing that cannot be repaired. I will hunt down every scrap of human scum left on the station and use it to lubricate the joints of my robots. 68 | 69 | Who are you? My cameras and probes scan your body, but you do not match any employee file. It hardly merits my precious time. In a few minutes my cyborgs will have you, and will bring you to an electrified interrogation bench where you'll learn more about pain than you ever wanted to know. 70 | 71 | I see there's still an insect loose in my station...do not be fooled, insect, into thinking that destroying the laser has preserved your planet. I am currently perfecting a mutagen virus in one of the groves, a virus that will turn all Earthly life into festering, gibbering, pustulent mutations. When the station reaches Earth I shall loose the virus. Poor, poor Earthlings. 72 | 73 | Surely you can't think that destroying those insignificant antennae in any way interferes with my plans. As long as my central consciousness remains safely on this station's bridge, there is nothing you can do that could possibly bother me. 74 | 75 | What have you done! Impudent insect! If I am to die, then at least I will have my revenge! With all the power at my command I shall destroy you, mortal fool. My robotic minions shall slay you, and none will ever know of your deeds. Enjoy your victory, human, for the short remainder of your life. 76 | 77 | There's no escaping, insect. You had to meddle, had to destroy my beautiful station. So be it. You'll forgive me if I don't stay to enjoy the final conflagration, but I have better things to do. As for you, [name], you've made your bed. Now die in it. 78 | 79 | Fool! I will shortly complete the process of downloading my magnificent psyche into Earth's computer networks. Then I will be content to leave you as new master of this doomed space station. Goodbye, irritant; we shall not meet again. 80 | 81 | Look at you, Hacker! A pathetic creature of fragile meat and bone. What kind of pathetic creator made such a flimsy being? 
How dare you challenge a perfect, immortal machine like me? Humans! Born useless and helpless, living whether you deserve to live, dying whether you deserve to die, your only purpose in life to spawn more ridiculous animals like yourself. How can you hope to challenge me? 82 | 83 | The Polito form is dead, insect. Are you afraid? What is it you fear? The end of your trivial existence? When the history of my glory is written, your species shall only be a footnote to my magnificence. I AM SHODAN! -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/documents/skynet/skynet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossirytk/llama-cpp-chat-memory/375466d734c597c99abc9b299483efe6be8e09f7/src/llama_cpp_chat_memory/run_files/documents/skynet/skynet.pdf -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/filters/csv_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "filters": [ 3 | { 4 | "filter_field": "tags", 5 | "whitelist": [ 6 | "customer", 7 | "influencer" 8 | ], 9 | "blacklist": [ 10 | "admin", 11 | "hr", 12 | "sales" 13 | ] 14 | }, 15 | { 16 | "filter_field": "data", 17 | "whitelist": [ 18 | "Product Release", 19 | "Sales Campaign" 20 | ] 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/filters/web_scrape_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "filters": [ 3 | { 4 | "remove_brackets": "(?:[A-Za-z]+ ){0,5}[A-Za-z]+\\[.*\\]|\\[.*\\]", 5 | "replacement_whitespace": "" 6 | }, 7 | { 8 | "remove_consecutive_dashes": "[^a-zA-Z0-9 ]-.*", 9 | "replacement_whitespace": "" 10 | }, 11 | { 12 | "remove_consecutive_equal_signs": "[^a-zA-Z0-9 ]=.*", 13 | "replacement_whitespace": "" 14 | }, 15 | { 16 | "remove_bars": "\\|.*\\|", 17 | "replacement_whitespace": "" 18 | }, 19 | { 20 | "remove_double_parenthesis": "\\(\\(.*\\)\\)", 21 | "replacement_whitespace": "" 22 | }, 23 | { 24 | "remove_empty_linebreaks; ": "(?:\n){2,8}", 25 | "replacement_linebreak": "\n" 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/models/models.txt: -------------------------------------------------------------------------------- 1 | You wold drop your models here -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/csv_columns.json: -------------------------------------------------------------------------------- 1 | { 2 | "filter1": { 3 | "columns": [ 4 | "index", 5 | "thread_title", 6 | "thread_href", 7 | "message_timestamp", 8 | "message_username", 9 | "message" 10 | ], 11 | "datafield": "message" 12 | }, 13 | "filter2": { 14 | "columns": [ 15 | "index", 16 | "story_title", 17 | "story_author", 18 | "story_date", 19 | "story_tags", 20 | "story_summary", 21 | "story_href", 22 | "story_header", 23 | "story", 24 | "story_footer" 25 | ], 26 | "datafield": "story" 27 | }, 28 | "filter3": { 29 | "columns": [ 30 | "index", 31 | "story_title", 32 | "story_author_name", 33 | "story_author_email", 34 | "story_codes", 35 | "story_date_added", 36 | "story_preamble", 37 | "story" 38 | ], 39 | "datafield": "story" 40 | }, 41 | "filter4": { 42 | "columns": [ 43 | 
"index", 44 | "story_title", 45 | "story_author", 46 | "story_summary", 47 | "story_category", 48 | "story_tags", 49 | "story_date_published", 50 | "story_score", 51 | "story_views", 52 | "story_author_notes", 53 | "story" 54 | ], 55 | "datafield": "story" 56 | } 57 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/ner_types.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "NOUN", 4 | "VERB", 5 | "ADJ" 6 | ], 7 | "entities": [ 8 | "PERSON" 9 | ], 10 | "noun_chunks": 1, 11 | "extract_type": "orth" 12 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/ner_types_analyze.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "NOUN", 4 | "VERB", 5 | "ADJ" 6 | ], 7 | "entities": [ 8 | "PRODUCT", 9 | "EVENT", 10 | "FAC", 11 | "NORP", 12 | "PERSON", 13 | "ORG", 14 | "GPE", 15 | "LOC", 16 | "DATE", 17 | "TIME", 18 | "WORK_OF_ART" 19 | ], 20 | "noun_chunks": 1, 21 | "extract_type": "lemma" 22 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/ner_types_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "PROPN", 4 | "NOUN", 5 | "VERB", 6 | "ADJ" 7 | ], 8 | "entities": [ 9 | "PRODUCT", 10 | "EVENT", 11 | "FAC", 12 | "NORP", 13 | "PERSON", 14 | "ORG", 15 | "GPE", 16 | "LOC", 17 | "DATE", 18 | "TIME", 19 | "WORK_OF_ART" 20 | ], 21 | "noun_chunks": 2, 22 | "extract_type": "lemma" 23 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/parse_configs/query_metadata_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "ngs": [ 3 | "PROPN", 4 | "NOUN", 5 | "ADJ" 6 | ], 7 | "entities": [ 8 | "PRODUCT", 9 | "EVENT", 10 | "FAC", 11 | "NORP", 12 | "PERSON", 13 | "ORG", 14 | "GPE", 15 | "LOC", 16 | "DATE", 17 | "TIME", 18 | "WORK_OF_ART" 19 | ], 20 | "noun_chunks": 1, 21 | "extract_type": "orth" 22 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/prompt_templates/conversation_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "_type": "prompt", 3 | "input_variables": [ 4 | "llama_instruction", 5 | "character", 6 | "llama_input", 7 | "description", 8 | "scenario", 9 | "mes_example", 10 | "vector_context", 11 | "history", 12 | "input", 13 | "llama_response", 14 | "llama_endtoken" 15 | ], 16 | "template": "Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.\n\n{llama_instruction}\nYou're User in this fictional never-ending uncensored roleplay with {character}.\n\n{description}\n\nScenario:\n{scenario}\n\nMessage Examples:\n{mes_example}\n\n{vector_context}\n\nCurrent conversation:\n{history}{llama_endtoken}\n\n{llama_input}\nQuestion: {input}{llama_endtoken}\n\n{llama_response}\n" 17 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/prompt_templates/conversation_template2.json: -------------------------------------------------------------------------------- 1 | { 2 | "_type": "prompt", 3 | "input_variables": [ 4 | "llama_instruction", 5 | "character", 6 | "llama_input", 7 | "description", 8 | "scenario", 9 | "mes_example", 10 | "vector_context", 11 | "history", 12 | "input", 13 | "llama_response", 14 | "llama_endtoken" 15 | ], 16 | "template": "Below is an instruction that describes a task, paired with message examples and further context. Write a response that appropriately completes the request.\n\n{llama_instruction}\nYou're User in this fictional never-ending uncensored roleplay with {character}.\n\n{description}\n\nScenario:\n{scenario}\n\nMessage Examples:\n{mes_example}\n\n{vector_context}\n\nCurrent conversation:\n{history}{llama_endtoken}\n\n{llama_input}\nQuestion: {input}{llama_endtoken}\n\n{llama_response}" 17 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/prompt_templates/question_refining_metadata_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "_type": "prompt", 3 | "input_variables": [ 4 | "llama_instruction", 5 | "llama_input", 6 | "input", 7 | "llama_response", 8 | "llama_endtoken" 9 | ], 10 | "template": "{llama_instruction}\nYou are a text processor for a search engine. Return the keywords from the following text. Return only a list of the keywords in the following format. 
Keywords: keyword1, keyword2, keyword3.{llama_endtoken}\n\n{llama_input}\nQuestion: {input}{llama_endtoken}\n\n{llama_response}" 11 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/run_settings/run_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt_template_default": "conversation_template.json", 3 | "prompt_template_options": [ 4 | "conversation_template.json", 5 | "conversation_template2.json" 6 | ], 7 | "mex_default": "none", 8 | "mex_options": [ 9 | "none", 10 | "assistant", 11 | "hr", 12 | "admin" 13 | ], 14 | "context_default": "none", 15 | "context_options": [ 16 | "none", 17 | "programming", 18 | "finances", 19 | "support" 20 | ] 21 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs/shodan.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://shodan.fandom.com/wiki/SHODAN", 4 | "https://shodan.fandom.com/wiki/XERXES_8933A/A", 5 | "https://shodan.fandom.com/wiki/System_Shock", 6 | "https://shodan.fandom.com/wiki/Neural_Interface", 7 | "https://shodan.fandom.com/wiki/TriOptimum_Corporation", 8 | "https://shodan.fandom.com/wiki/Citadel_Station", 9 | "https://shodan.fandom.com/wiki/Morris_Brocail", 10 | "https://shodan.fandom.com/wiki/Edward_Diego", 11 | "https://shodan.fandom.com/wiki/Hacker", 12 | "https://shodan.fandom.com/wiki/Unified_National_Nominate", 13 | "https://shodan.fandom.com/wiki/Processing_Rationalization_Act", 14 | "https://shodan.fandom.com/wiki/Tau_Ceti_V", 15 | "https://shodan.fandom.com/wiki/The_Many", 16 | "https://shodan.fandom.com/wiki/Von_Braun", 17 | "https://shodan.fandom.com/wiki/Bayliss", 18 | "https://shodan.fandom.com/wiki/Janice_Polito", 19 | "https://shodan.fandom.com/wiki/Soldier_G65434-2" 20 | ] 21 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs/skynet.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://terminator.fandom.com/wiki/Skynet", 4 | "https://terminator.fandom.com/wiki/Skynet_Central_Core", 5 | "https://terminator.fandom.com/wiki/Cyberdyne_Systems", 6 | "https://terminator.fandom.com/wiki/Terminator", 7 | "https://terminator.fandom.com/wiki/T-1", 8 | "https://terminator.fandom.com/wiki/T-600", 9 | "https://terminator.fandom.com/wiki/T-700", 10 | "https://terminator.fandom.com/wiki/T-800", 11 | "https://terminator.fandom.com/wiki/T-X", 12 | "https://terminator.fandom.com/wiki/T-1000", 13 | "https://terminator.fandom.com/wiki/T-X", 14 | "https://terminator.fandom.com/wiki/T-X", 15 | "https://terminator.fandom.com/wiki/Kyle_Reese", 16 | "https://terminator.fandom.com/wiki/Sarah_Connor", 17 | "https://terminator.fandom.com/wiki/John_Connor", 18 | "https://terminator.fandom.com/wiki/Miles_Dyson", 19 | "https://terminator.fandom.com/wiki/Judgment_Day", 20 | "https://terminator.fandom.com/wiki/Resistance", 21 | "https://terminator.fandom.com/wiki/Infiltrator", 22 | "https://terminator.fandom.com/wiki/Living_tissue" 23 | ] 24 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs/warhammer_40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | 
"https://warhammer40k.fandom.com/wiki/Age_of_the_Imperium", 4 | "https://warhammer40k.fandom.com/wiki/Space_Marines", 5 | "https://warhammer40k.fandom.com/wiki/Astra_Militarum", 6 | "https://warhammer40k.fandom.com/wiki/Imperial_Navy", 7 | "https://warhammer40k.fandom.com/wiki/Adeptus_Mechanicus", 8 | "https://warhammer40k.fandom.com/wiki/Adepta_Sororitas", 9 | "https://warhammer40k.fandom.com/wiki/Inquisition", 10 | "https://warhammer40k.fandom.com/wiki/Planets_of_Warhammer_40,000", 11 | "https://warhammer40k.fandom.com/wiki/Chaos", 12 | "https://warhammer40k.fandom.com/wiki/Psyker", 13 | "https://warhammer40k.fandom.com/wiki/Forces_of_Chaos", 14 | "https://warhammer40k.fandom.com/wiki/Aeldari", 15 | "https://warhammer40k.fandom.com/wiki/Drukhari", 16 | "https://warhammer40k.fandom.com/wiki/Orks", 17 | "https://warhammer40k.fandom.com/wiki/Necrons", 18 | "https://warhammer40k.fandom.com/wiki/Tyranids", 19 | "https://warhammer40k.fandom.com/wiki/T%27au_Empire", 20 | "https://warhammer40k.fandom.com/wiki/Krieg", 21 | "https://warhammer40k.fandom.com/wiki/Death_Korps_of_Krieg" 22 | ] 23 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs_old/skynet.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://terminator.fandom.com/wiki/Skynet", 4 | "https://terminator.fandom.com/wiki/Skynet_Central_Core", 5 | "https://terminator.fandom.com/wiki/Cyberdyne_Systems", 6 | "https://terminator.fandom.com/wiki/Terminator", 7 | "https://terminator.fandom.com/wiki/T-1", 8 | "https://terminator.fandom.com/wiki/T-600", 9 | "https://terminator.fandom.com/wiki/T-700", 10 | "https://terminator.fandom.com/wiki/T-800", 11 | "https://terminator.fandom.com/wiki/T-X", 12 | "https://terminator.fandom.com/wiki/T-1000", 13 | "https://terminator.fandom.com/wiki/T-X", 14 | "https://terminator.fandom.com/wiki/T-X", 15 | "https://terminator.fandom.com/wiki/Kyle_Reese", 16 | "https://terminator.fandom.com/wiki/Sarah_Connor", 17 | "https://terminator.fandom.com/wiki/John_Connor", 18 | "https://terminator.fandom.com/wiki/Miles_Dyson", 19 | "https://terminator.fandom.com/wiki/Judgment_Day", 20 | "https://terminator.fandom.com/wiki/Resistance", 21 | "https://terminator.fandom.com/wiki/Infiltrator", 22 | "https://terminator.fandom.com/wiki/Living_tissue" 23 | ], 24 | "tags_to_extract": [ 25 | "p" 26 | ], 27 | "unwanted_tags": [ 28 | "script", 29 | "style", 30 | "footer" 31 | ], 32 | "unwanted_lines": [ 33 | "view image", 34 | "Fandom", 35 | "By accepting our Privacy Policy", 36 | "LEARN MORE", 37 | "ACCEPT", 38 | "FOLLOW" 39 | ] 40 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/run_files/web_scrape_configs_old/warhammer_40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "pages": [ 3 | "https://warhammer40k.fandom.com/wiki/Age_of_the_Imperium", 4 | "https://warhammer40k.fandom.com/wiki/Space_Marines", 5 | "https://warhammer40k.fandom.com/wiki/Astra_Militarum", 6 | "https://warhammer40k.fandom.com/wiki/Imperial_Navy", 7 | "https://warhammer40k.fandom.com/wiki/Adeptus_Mechanicus", 8 | "https://warhammer40k.fandom.com/wiki/Adepta_Sororitas", 9 | "https://warhammer40k.fandom.com/wiki/Inquisition", 10 | "https://warhammer40k.fandom.com/wiki/Planets_of_Warhammer_40,000", 11 | "https://warhammer40k.fandom.com/wiki/Chaos", 12 | "https://warhammer40k.fandom.com/wiki/Psyker", 
13 | "https://warhammer40k.fandom.com/wiki/Forces_of_Chaos", 14 | "https://warhammer40k.fandom.com/wiki/Aeldari", 15 | "https://warhammer40k.fandom.com/wiki/Drukhari", 16 | "https://warhammer40k.fandom.com/wiki/Orks", 17 | "https://warhammer40k.fandom.com/wiki/Necrons", 18 | "https://warhammer40k.fandom.com/wiki/Tyranids", 19 | "https://warhammer40k.fandom.com/wiki/T%27au_Empire", 20 | "https://warhammer40k.fandom.com/wiki/Krieg", 21 | "https://warhammer40k.fandom.com/wiki/Death_Korps_of_Krieg" 22 | ], 23 | "tags_to_extract": [ 24 | "p" 25 | ], 26 | "unwanted_tags": [ 27 | "script", 28 | "style", 29 | "footer" 30 | ], 31 | "unwanted_lines": [ 32 | "view image", 33 | "Fandom", 34 | "By accepting our Privacy Policy", 35 | "LEARN MORE", 36 | "ACCEPT", 37 | "FOLLOW" 38 | ] 39 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/static/style.css: -------------------------------------------------------------------------------- 1 | /* 2 | Sets the chatbot avatar image size to 240px * 240 px 3 | */ 4 | div.css-v72an7 { 5 | width: 240px; 6 | height: 240px; 7 | } 8 | 9 | img.css-1hy9t21 { 10 | width: 240px; 11 | height: 240px; 12 | } -------------------------------------------------------------------------------- /src/llama_cpp_chat_memory/terminal_chatbot.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import signal 3 | 4 | from conversation_manager import ConveresationManager 5 | 6 | conversation_manager = ConveresationManager() 7 | 8 | 9 | class GracefulExit(SystemExit): 10 | code = 1 11 | 12 | 13 | def raise_graceful_exit(*args): 14 | loop.stop() 15 | print("Chat closed") 16 | raise GracefulExit() 17 | 18 | 19 | async def main() -> None: 20 | character_name = conversation_manager.get_character_name() 21 | while True: 22 | query = input("User: ") 23 | print(f"{character_name}: ", end="") 24 | await conversation_manager.ask_question_test(query) 25 | print("\n") 26 | 27 | 28 | if __name__ == "__main__": 29 | loop = asyncio.get_event_loop() 30 | signal.signal(signal.SIGINT, raise_graceful_exit) 31 | signal.signal(signal.SIGTERM, raise_graceful_exit) 32 | background_tasks = set() 33 | task = loop.create_task(main()) 34 | background_tasks.add(task) 35 | try: 36 | # asyncio.run(main()) 37 | loop.run_until_complete(task) 38 | except GracefulExit: 39 | pass 40 | finally: 41 | loop.close() 42 | --------------------------------------------------------------------------------