├── images
│   ├── .placeholder
│   ├── rag.png
│   ├── mongomistral1.jpg
│   ├── mongomistral2.jpg
│   ├── mongomistral3.jpg
│   ├── mongomistral4.jpg
│   ├── mongomistral5.jpg
│   └── mistral-search-graph.png
├── third_party
│   ├── E2B_Code_Interpreting
│   │   ├── codestral-code-interpreter-python
│   │   │   ├── .gitignore
│   │   │   ├── requirements.txt
│   │   │   ├── image_1.png
│   │   │   └── README.md
│   │   └── codestral-code-interpreter-js
│   │       ├── .gitignore
│   │       ├── image_1.png
│   │       ├── .env.template
│   │       ├── package.json
│   │       ├── README.md
│   │       └── index.ts
│   ├── wandb
│   │   ├── static
│   │   │   ├── ft.png
│   │   │   ├── nli.png
│   │   │   ├── compare.png
│   │   │   ├── eugene1.png
│   │   │   ├── eval_7b.png
│   │   │   ├── eval_large.png
│   │   │   ├── yt_banner.png
│   │   │   └── ft_dashboard.png
│   │   └── README.md
│   ├── Ollama
│   │   └── 20240321_ollama_meetup
│   │       ├── requirements.txt
│   │       ├── myfuncs.py
│   │       ├── README.md
│   │       └── run.py
│   ├── x-cmd
│   │   ├── static
│   │   │   ├── x.mistral.png
│   │   │   ├── mistral.init.png
│   │   │   ├── mistral.chat.1.png
│   │   │   └── mistral.chat.2.png
│   │   └── README.md
│   ├── Chainlit
│   │   ├── public
│   │   │   ├── starters.jpg
│   │   │   ├── chat-visual.jpg
│   │   │   ├── idea.svg
│   │   │   ├── write.svg
│   │   │   ├── learn.svg
│   │   │   └── logo_light.svg
│   │   ├── README.md
│   │   └── app.py
│   ├── langchain
│   │   ├── img
│   │   │   └── langgraph_adaptive_rag.png
│   │   └── README.md
│   ├── mesop
│   │   ├── chat.py
│   │   ├── chat_with_pdfs.py
│   │   └── README.md
│   ├── Indexify
│   │   ├── pdf-summarization
│   │   │   ├── pdf_summarization_graph.py
│   │   │   ├── upload_and_retreive.py
│   │   │   ├── README.md
│   │   │   └── pdf-summarization.ipynb
│   │   ├── pdf-entity-extraction
│   │   │   ├── pdf_entity_extraction_pipeline.py
│   │   │   ├── upload_and_retreive.py
│   │   │   ├── README.md
│   │   │   └── pdf-entity-extraction.ipynb
│   │   └── README.md
│   ├── panel
│   │   ├── basic_chat.py
│   │   ├── chat_history.py
│   │   └── chat_with_pdfs.py
│   ├── gradio
│   │   ├── chat.py
│   │   ├── chat_with_pdfs.py
│   │   └── README.md
│   ├── streamlit
│   │   ├── chat.py
│   │   ├── chat_with_pdfs.py
│   │   └── README.md
│   ├── LlamaIndex
│   │   ├── propertygraphs
│   │   │   ├── README.md
│   │   │   └── property_graph_neo4j.ipynb
│   │   ├── README.md
│   │   └── RouterQueryEngine.ipynb
│   └── Haystack
│       └── haystack_chat_with_docs.ipynb
├── LICENSE
├── .github
│   └── pull_request_template.md
├── mistral
│   └── data_generation
│       └── external_files
│           ├── guide_1.txt
│           ├── guide_2.txt
│           ├── guide_4.txt
│           └── guide_3.txt
├── .gitignore
├── concept-deep-dive
│   └── quantization
│       └── README.md
├── README.md
└── data
    └── northwind-queries.jsonl
/images/.placeholder:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/rag.png
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-python/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 |
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | node_modules
--------------------------------------------------------------------------------
/images/mongomistral1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/mongomistral1.jpg
--------------------------------------------------------------------------------
/images/mongomistral2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/mongomistral2.jpg
--------------------------------------------------------------------------------
/images/mongomistral3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/mongomistral3.jpg
--------------------------------------------------------------------------------
/images/mongomistral4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/mongomistral4.jpg
--------------------------------------------------------------------------------
/images/mongomistral5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/mongomistral5.jpg
--------------------------------------------------------------------------------
/images/mistral-search-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/images/mistral-search-graph.png
--------------------------------------------------------------------------------
/third_party/wandb/static/ft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/ft.png
--------------------------------------------------------------------------------
/third_party/Ollama/20240321_ollama_meetup/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==1.14.2
2 | kubernetes==29.0.0
3 | mistralai==0.1.6
4 |
--------------------------------------------------------------------------------
/third_party/wandb/static/nli.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/nli.png
--------------------------------------------------------------------------------
/third_party/wandb/static/compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/compare.png
--------------------------------------------------------------------------------
/third_party/wandb/static/eugene1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/eugene1.png
--------------------------------------------------------------------------------
/third_party/wandb/static/eval_7b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/eval_7b.png
--------------------------------------------------------------------------------
/third_party/wandb/static/eval_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/eval_large.png
--------------------------------------------------------------------------------
/third_party/wandb/static/yt_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/yt_banner.png
--------------------------------------------------------------------------------
/third_party/x-cmd/static/x.mistral.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/x-cmd/static/x.mistral.png
--------------------------------------------------------------------------------
/third_party/Chainlit/public/starters.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/Chainlit/public/starters.jpg
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-python/requirements.txt:
--------------------------------------------------------------------------------
1 | mistralai==0.4.2
2 | e2b_code_interpreter==0.0.10
3 |
--------------------------------------------------------------------------------
/third_party/wandb/static/ft_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/wandb/static/ft_dashboard.png
--------------------------------------------------------------------------------
/third_party/x-cmd/static/mistral.init.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/x-cmd/static/mistral.init.png
--------------------------------------------------------------------------------
/third_party/Chainlit/public/chat-visual.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/Chainlit/public/chat-visual.jpg
--------------------------------------------------------------------------------
/third_party/x-cmd/static/mistral.chat.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/x-cmd/static/mistral.chat.1.png
--------------------------------------------------------------------------------
/third_party/x-cmd/static/mistral.chat.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/x-cmd/static/mistral.chat.2.png
--------------------------------------------------------------------------------
/third_party/langchain/img/langgraph_adaptive_rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/langchain/img/langgraph_adaptive_rag.png
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/image_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/image_1.png
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-python/image_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vaibhavs10/cookbook/main/third_party/E2B_Code_Interpreting/codestral-code-interpreter-python/image_1.png
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/.env.template:
--------------------------------------------------------------------------------
1 | # TODO: Get your E2B API key from https://e2b.dev/docs/getting-started/api-key
2 | E2B_API_KEY=""
3 |
4 | # TODO: Get your Mistral API key from https://console.mistral.ai/api-keys/
5 | MISTRAL_API_KEY=""
--------------------------------------------------------------------------------
/third_party/Chainlit/public/idea.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/third_party/Chainlit/public/write.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/third_party/Ollama/20240321_ollama_meetup/myfuncs.py:
--------------------------------------------------------------------------------
1 | from kubernetes import client, config
2 |
3 | def list_pods(namespace):
4 |     # Load kubeconfig
5 |     config.load_kube_config()
6 |     # Create API client instance
7 |     api_instance = client.CoreV1Api()
8 |     # Call API to list all pods in the given namespace
9 |     response = api_instance.list_namespaced_pod(namespace)
10 |     out = {"pods": []}
11 |     for pod in response.items:
12 |         out["pods"].append({"name": pod.metadata.name, "status": pod.status.phase})
13 |     return out
14 |
15 |
16 |
--------------------------------------------------------------------------------
/third_party/mesop/chat.py:
--------------------------------------------------------------------------------
1 | import mesop as me
2 | import mesop.labs as mel
3 | from mistralai.client import MistralClient
4 | from mistralai.models.chat_completion import ChatMessage
5 |
6 | mistral_api_key = "api_key"
7 | cli = MistralClient(api_key = mistral_api_key)
8 |
9 | def ask_mistral(message: str, history: list[mel.ChatMessage]):
10 |     messages = [ChatMessage(role=m.role, content=m.content) for m in history[:-1]]
11 |     for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
12 |         yield chunk.choices[0].delta.content
13 |
14 | @me.page(title="Talk to Mistral")
15 | def page():
16 |     mel.chat(ask_mistral, title="Ask Mistral", bot_user="Mistral")
17 |
--------------------------------------------------------------------------------
/third_party/Chainlit/public/learn.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-summarization/pdf_summarization_graph.py:
--------------------------------------------------------------------------------
1 | from indexify import IndexifyClient, ExtractionGraph
2 |
3 | client = IndexifyClient()
4 |
5 | extraction_graph_spec = """
6 | name: 'pdf_summarizer'
7 | extraction_policies:
8 |   - extractor: 'tensorlake/pdfextractor'
9 |     name: 'pdf_to_text'
10 |   - extractor: 'tensorlake/mistral'
11 |     name: 'text_to_summary'
12 |     input_params:
13 |       model_name: 'mistral-large-latest'
14 |       key: 'YOUR_MISTRAL_API_KEY'
15 |       system_prompt: 'Summarize the following text in a concise manner, highlighting the key points:'
16 |     content_source: 'pdf_to_text'
17 | """
18 |
19 | extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
20 | client.create_extraction_graph(extraction_graph)
--------------------------------------------------------------------------------
/third_party/panel/basic_chat.py:
--------------------------------------------------------------------------------
1 | import panel as pn
2 | from mistralai.client import MistralClient
3 | from mistralai.models.chat_completion import ChatMessage
4 |
5 | pn.extension()
6 |
7 | mistral_api_key = "your_api_key"
8 | cli = MistralClient(api_key = mistral_api_key)
9 |
10 | async def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
11 |     messages = [ChatMessage(role = "user", content = contents)]
12 |     response = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024)
13 |     message = ""
14 |     for chunk in response:
15 |         message += chunk.choices[0].delta.content
16 |         yield message
17 |
18 | chat_interface = pn.chat.ChatInterface(callback = callback, callback_user = "Mistral")
19 | chat_interface.servable()
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-entity-extraction/pdf_entity_extraction_pipeline.py:
--------------------------------------------------------------------------------
1 | from indexify import IndexifyClient, ExtractionGraph
2 |
3 | client = IndexifyClient()
4 |
5 | extraction_graph_spec = """
6 | name: 'pdf_entity_extractor'
7 | extraction_policies:
8 |   - extractor: 'tensorlake/pdfextractor'
9 |     name: 'pdf_to_text'
10 |   - extractor: 'tensorlake/mistral'
11 |     name: 'text_to_entities'
12 |     input_params:
13 |       model_name: 'mistral-large-latest'
14 |       key: 'YOUR_MISTRAL_API_KEY'
15 |       system_prompt: 'Extract and categorize all named entities from the following text. Provide the results in a JSON format with categories: persons, organizations, locations, dates, and miscellaneous.'
16 |     content_source: 'pdf_to_text'
17 | """
18 |
19 | extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
20 | client.create_extraction_graph(extraction_graph)
--------------------------------------------------------------------------------
/third_party/gradio/chat.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from mistralai.client import MistralClient
3 | from mistralai.models.chat_completion import ChatMessage
4 |
5 | mistral_api_key = "your_api_key"
6 | cli = MistralClient(api_key = mistral_api_key)
7 |
8 | def ask_mistral(message: str, history: list):
9 |     messages = []
10 |     for couple in history:
11 |         messages.append(ChatMessage(role= "user", content = couple[0]))
12 |         messages.append(ChatMessage(role= "assistant", content = couple[1]))
13 |
14 |     messages.append(ChatMessage(role = "user", content = message))
15 |     full_response = ""
16 |     for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
17 |         full_response += chunk.choices[0].delta.content
18 |         yield full_response
19 |
20 | app = gr.ChatInterface(fn = ask_mistral, title = "Ask Mistral")
21 | app.launch()
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "codestral-code-interpreter",
3 |   "version": "1.0.0",
4 |   "description": "This is an example of running LLM-generated code tasks in a secure and isolated cloud environment using the E2B Code Interpreter SDK.",
5 |   "main": "index.ts",
6 |   "scripts": {
7 |     "start": "tsx index.ts"
8 |   },
9 |   "keywords": [],
10 |   "author": "",
11 |   "license": "ISC",
12 |   "devDependencies": {
13 |     "@types/node": "^20.12.11",
14 |     "globals": "^15.3.0",
15 |     "ts-node": "^10.9.2",
16 |     "tsx": "^4.9.3",
17 |     "typescript": "^5.4.5",
18 |     "typescript-eslint": "^7.12.0"
19 |   },
20 |   "dependencies": {
21 |     "@e2b/code-interpreter": "^0.0.5",
22 |     "@mistralai/mistralai": "^0.4.0",
23 |     "dotenv": "^16.4.5",
24 |     "esbuild": "^0.20.2"
25 |   }
26 | }
27 |
--------------------------------------------------------------------------------
/third_party/panel/chat_history.py:
--------------------------------------------------------------------------------
1 | import panel as pn
2 | from mistralai.client import MistralClient
3 | from mistralai.models.chat_completion import ChatMessage
4 |
5 | pn.extension()
6 |
7 | mistral_api_key = "your_api_key"
8 | cli = MistralClient(api_key = mistral_api_key)
9 |
10 | async def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
11 |     messages_objects = [w for w in instance.objects if w.user != "System" and isinstance(w.object, str)]
12 |     messages = [ChatMessage(
13 |         role="user" if w.user == "User" else "assistant",
14 |         content=w.object
15 |     ) for w in messages_objects]
16 |
17 |     response = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024)
18 |     message = ""
19 |     for chunk in response:
20 |         message += chunk.choices[0].delta.content
21 |         yield message
22 |
23 | chat_interface = pn.chat.ChatInterface(callback = callback, callback_user = "Mistral")
24 | chat_interface.send("Chat with Mistral!", user = "System", respond = False)
25 | chat_interface.servable()
--------------------------------------------------------------------------------
/third_party/Ollama/20240321_ollama_meetup/README.md:
--------------------------------------------------------------------------------
1 | # Ollama meetup demo (5 min)
2 |
3 | - Make sure that Docker is installed and running on your laptop with Kubernetes enabled.
4 |
5 | - Install Ollama, start it, and pull the `mistral` model:
6 |
7 | ```shell
8 | ollama pull mistral
9 | ```
10 |
11 | By default, the server is reachable on port 11434.
12 |
13 | - Create a virtualenv, activate it, and install the Python requirements.
14 |
15 | - To prove that you are not scamming people, start some random deployment in a custom namespace:
16 |
17 | ```shell
18 | kubectl create ns demo
19 | kubectl apply -f https://k8s.io/examples/controllers/nginx-deployment.yaml -n demo
20 | ```
21 |
22 | - Profit:
23 |
24 | ```shell
25 | python run.py
26 | ```
27 |
28 | There is a lot to improve, namely:
29 | - handling empty outputs gracefully
30 | - asking explicitly for the namespace when it is not provided by the user
31 | - shelling out to kubectl instead of installing the Kubernetes Python client (sketched just below)
32 |
33 | ...but that's the beauty of the game: giving the audience ideas for building cool stuff!
34 |
35 |
--------------------------------------------------------------------------------
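As a sketch of the kubectl idea above (a hypothetical helper, not part of the demo code; it assumes `kubectl` is installed and pointed at your cluster), the Kubernetes Python client could be replaced with a thin subprocess wrapper:

```python
import json
import subprocess

def list_pods(namespace: str) -> dict:
    """Same output shape as myfuncs.list_pods, but shelling out to kubectl."""
    raw = subprocess.run(
        ["kubectl", "get", "pods", "-n", namespace, "-o", "json"],
        capture_output=True, text=True, check=True,
    ).stdout
    items = json.loads(raw)["items"]
    return {"pods": [{"name": p["metadata"]["name"], "status": p["status"]["phase"]}
                     for p in items]}

if __name__ == "__main__":
    print(list_pods("demo"))
```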
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Mistral AI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/third_party/streamlit/chat.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from mistralai.client import MistralClient
3 |
4 | mistral_api_key = "your_api_key"
5 | cli = MistralClient(api_key = mistral_api_key)
6 |
7 | st.title("Chat with Mistral")
8 |
9 | if "messages" not in st.session_state:
10 |     st.session_state.messages = []
11 |
12 | for message in st.session_state.messages:
13 |     with st.chat_message(message["role"]):
14 |         st.markdown(message["content"])
15 |
16 | def ask_mistral(messages: list):
17 |     resp = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024)
18 |     for chunk in resp:
19 |         yield chunk.choices[0].delta.content
20 |
21 | if prompt := st.chat_input("Talk to Mistral!"):
22 |     with st.chat_message("user"):
23 |         st.markdown(prompt)
24 |     st.session_state.messages.append({"role": "user", "content": prompt})
25 |
26 |     with st.chat_message("assistant"):
27 |         response_generator = ask_mistral(st.session_state.messages)
28 |         response = st.write_stream(response_generator)
29 |
30 |     st.session_state.messages.append({"role": "assistant", "content": response})
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Cookbook Pull Request
2 |
3 | ## Description
4 |
5 | *Please include a summary of the objective of this PR and list any dependencies required for this change.*
6 |
7 | ...
8 |
9 | ## Type of Change
10 |
11 | What type of PR is it?
12 |
13 | - [ ] New Cookbook
14 | - [ ] Notebook File
15 |     - [ ] Does it work on Google Colab?
16 | - [ ] Markdown File
17 | - [ ] Cookbook Update
18 | - [ ] Code Refactoring
19 | - [ ] Bug Fix
20 | - [ ] README.md Update
21 | ___
22 | - [ ] Other (please describe):
23 |
24 | ## Cookbook Checklist:
25 |
26 | - [ ] My code is easy to read and well structured.
27 | - [ ] I've pinned the versions of any required dependencies.
28 | - [ ] I have performed a self-review of my own code.
29 | - [ ] I have commented my code, particularly in hard-to-understand areas.
30 | - [ ] My changes generate no new warnings or errors.
31 | ___
32 | - [ ] My changes do not concern the cookbooks.
33 |
34 | ## README.md Checklist
35 |
36 | - [ ] I've added my cookbook to the table.
37 | ___
38 | - [ ] My changes do not concern the README file.
39 |
40 | ## Additional Context
41 |
42 | *Add any other context or screenshots about the feature request here.*
43 |
44 | ...
45 |
--------------------------------------------------------------------------------
/third_party/LlamaIndex/propertygraphs/README.md:
--------------------------------------------------------------------------------
1 | # PropertyGraphs with LlamaIndex and MistralAI
2 |
3 | Here, we provide cookbooks for building PropertyGraphs using LlamaIndex and MistralAI.
4 |
5 | 1. `property_graph.ipynb` - Build a Property Graph using default extractors and retrievers.
6 | 2. `property_graph_extractors_retrievers.ipynb` - This notebook showcases how to define different extractors, retrievers, and prompts for building PropertyGraphs. (Note: This notebook is for walkthrough purposes only and does not need to be run.)
7 | 3. `property_graph_neo4j.ipynb` - Build PropertyGraphs with Neo4j by customizing extractors and retrievers.
8 | 4. `property_graph_predefined_schema.ipynb` - Build PropertyGraphs with the `SchemaLLMPathExtractor` by pre-defining the schema of the PropertyGraph.
9 | 5. `property_graph_custom_retriever.ipynb` - Build a PropertyGraph with a custom retriever using `VectorContextRetriever` and `Text2CypherRetriever`.
10 |
11 | For more information about PropertyGraphs, refer to our [documentation](https://docs.llamaindex.ai/en/latest/examples/property_graph/graph_store/) and the [release blog post](https://www.llamaindex.ai/blog/introducing-the-property-graph-index-a-powerful-new-way-to-build-knowledge-graphs-with-llms).
--------------------------------------------------------------------------------
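For orientation before opening the notebooks, a default Property Graph build with Mistral models looks roughly like the sketch below; the model names, data directory, and constructor arguments are assumptions to adapt to your environment and LlamaIndex version.

```python
from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI

# Assumed model names and paths; adjust to your setup.
llm = MistralAI(model="mistral-large-latest", api_key="YOUR_MISTRAL_API_KEY")
embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key="YOUR_MISTRAL_API_KEY")

documents = SimpleDirectoryReader("./data").load_data()

# Build the graph with the default extractors, then query it.
index = PropertyGraphIndex.from_documents(documents, llm=llm, embed_model=embed_model)
query_engine = index.as_query_engine()
print(query_engine.query("What entities appear in these documents?"))
```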
/third_party/Chainlit/README.md:
--------------------------------------------------------------------------------
1 | # Chainlit & Mistral reasoning
2 |
3 | This application uses the Chainlit UI framework along with Mistral's tool calling to answer complex questions that require multi-step reasoning 🥳
4 |
5 | ## Requirements
6 |
7 | _Versions used for the demo are `chainlit==1.1.305` and `mistralai==0.4.1`_
8 |
9 | We manage environment variables with `python-dotenv`.
10 |
11 | You will need a Mistral API key, which you can get at https://console.mistral.ai/api-keys/.
12 | Make sure to set it as `MISTRAL_API_KEY=` in a `.env` file.
13 |
14 | ```shell
15 | pip install chainlit mistralai
16 | ```
17 |
18 | Optionally, you can get a Literal AI API key from [here](https://docs.getliteral.ai/get-started/installation#how-to-get-my-api-key)
19 | and set it as `LITERAL_API_KEY` in your `.env`. This will allow you to visualize the flow of your application.
20 |
21 | ## Run the Chainlit application
22 |
23 | The full application code lives in `app.py`. To run it, simply execute the following line:
24 |
25 | ```shell
26 | chainlit run app.py
27 | ```
28 |
29 | This will spin up your application on http://localhost:8080! 🎉
30 |
31 | For a more step-by-step tutorial on writing the application code, you can follow the `Chainlit - Mistral reasoning` notebook!
32 |
--------------------------------------------------------------------------------
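If you just want the shape of the pattern before reading `app.py`, a stripped-down handler might look like the sketch below: a Chainlit `on_message` hook lets a Mistral model decide whether to call a tool, runs it, and feeds the result back for the final answer. The tool, its schema, and the model name are illustrative assumptions rather than the cookbook's actual code, and the tool-result round trip may need adjusting to your `mistralai` version.

```python
import json
import os

import chainlit as cl
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])

# Hypothetical tool: the real app defines its own reasoning tools.
def get_word_length(word: str) -> str:
    return json.dumps({"length": len(word)})

tools = [{
    "type": "function",
    "function": {
        "name": "get_word_length",
        "description": "Return the number of characters in a word",
        "parameters": {
            "type": "object",
            "properties": {"word": {"type": "string"}},
            "required": ["word"],
        },
    },
}]

@cl.on_message
async def on_message(message: cl.Message):
    messages = [ChatMessage(role="user", content=message.content)]
    # First call: let the model decide whether it needs the tool.
    answer = client.chat(model="mistral-large-latest", messages=messages,
                         tools=tools, tool_choice="auto").choices[0].message
    if answer.tool_calls:
        call = answer.tool_calls[0]
        result = get_word_length(**json.loads(call.function.arguments))
        messages.append(answer)
        messages.append(ChatMessage(role="tool", name=call.function.name, content=result))
        # Second call: produce the final answer from the tool result.
        answer = client.chat(model="mistral-large-latest", messages=messages).choices[0].message
    await cl.Message(content=answer.content).send()
```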
/third_party/Indexify/pdf-summarization/upload_and_retreive.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | from indexify import IndexifyClient
4 |
5 | def download_pdf(url, save_path):
6 |     response = requests.get(url)
7 |     with open(save_path, 'wb') as f:
8 |         f.write(response.content)
9 |     print(f"PDF downloaded and saved to {save_path}")
10 |
11 | def summarize_pdf(pdf_path):
12 |     client = IndexifyClient()
13 |
14 |     # Upload the PDF file
15 |     content_id = client.upload_file("pdf_summarizer", pdf_path)
16 |
17 |     # Wait for the extraction to complete
18 |     client.wait_for_extraction(content_id)
19 |
20 |     # Retrieve the summarized content
21 |     summary = client.get_extracted_content(
22 |         content_id=content_id,
23 |         graph_name="pdf_summarizer",
24 |         policy_name="text_to_summary"
25 |     )
26 |
27 |     return summary[0]['content'].decode('utf-8')
28 |
29 | # Example usage
30 | if __name__ == "__main__":
31 |     pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
32 |     pdf_path = "reference_document.pdf"
33 |
34 |     # Download the PDF
35 |     download_pdf(pdf_url, pdf_path)
36 |
37 |     # Summarize the PDF
38 |     summary = summarize_pdf(pdf_path)
39 |     print("Summary of the PDF:")
40 |     print(summary)
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-entity-extraction/upload_and_retreive.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import requests
4 | from indexify import IndexifyClient
5 |
6 | def download_pdf(url, save_path):
7 |     response = requests.get(url)
8 |     with open(save_path, 'wb') as f:
9 |         f.write(response.content)
10 |     print(f"PDF downloaded and saved to {save_path}")
11 |
12 |
13 | def extract_entities_from_pdf(pdf_path):
14 |     client = IndexifyClient()
15 |
16 |     # Upload the PDF file
17 |     content_id = client.upload_file("pdf_entity_extractor", pdf_path)
18 |
19 |     # Wait for the extraction to complete
20 |     client.wait_for_extraction(content_id)
21 |
22 |     # Retrieve the extracted entities
23 |     entities_content = client.get_extracted_content(
24 |         content_id=content_id,
25 |         graph_name="pdf_entity_extractor",
26 |         policy_name="text_to_entities"
27 |     )
28 |
29 |     # Parse the JSON response
30 |     entities = json.loads(entities_content[0]['content'].decode('utf-8'))
31 |     return entities
32 |
33 | # Example usage
34 | if __name__ == "__main__":
35 |     pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
36 |     pdf_path = "reference_document.pdf"
37 |
38 |     # Download the PDF
39 |     download_pdf(pdf_url, pdf_path)
40 |     extracted_entities = extract_entities_from_pdf(pdf_path)
41 |
42 |     print("Extracted Entities:")
43 |     for category, entities in extracted_entities.items():
44 |         print(f"\n{category.capitalize()}:")
45 |         for entity in entities:
46 |             print(f"- {entity}")
--------------------------------------------------------------------------------
/third_party/LlamaIndex/README.md:
--------------------------------------------------------------------------------
1 | # LlamaIndex <> MistralAI Cookbooks
2 |
3 | [LlamaIndex](https://github.com/run-llama/llama_index) is a data framework for LLM-based applications which benefit from context augmentation. LlamaIndex provides the essential abstractions to more easily ingest, structure, and access private or domain-specific data in order to inject these safely and reliably into LLMs for more accurate text generation.
4 |
5 | Here we offer some cookbooks for using LlamaIndex with MistralAI's LLMs and embedding models.
6 |
7 | 1. `RAG.ipynb` - Notebook to help you build a quick RAG and retriever pipeline with Mistral.
8 | 2. `RouterQueryEngine.ipynb` - Notebook to help you use `RouterQueryEngine` to route user queries.
9 | 3. `SubQuestionQueryEngine.ipynb` - Notebook to help you use `SubQuestionQueryEngine` to answer complex user queries.
10 | 4. `Agents_Tools.ipynb` - Notebook to help you use `FunctionCallingAgent` and `ReActAgent` with tools and RAG query-engine tools.
11 | 5. `Adaptive_RAG.ipynb` - Notebook implementing the [Adaptive-RAG](https://arxiv.org/abs/2403.14403) paper to route user queries based on query complexity using `FunctionCallingAgent` with a Mistral LLM.
12 | 6. `propertygraphs` - Collection of notebooks to implement and examine [Property Graphs](https://docs.llamaindex.ai/en/latest/examples/property_graph/property_graph_basic/) using LlamaIndex, MistralAI, and Ollama.
13 |
14 | [Documentation](https://docs.llamaindex.ai/en/stable/)
15 | [Discord](https://discord.gg/dGcwcsnxhU)
16 | [Twitter](https://twitter.com/llama_index)
17 | [Linkedin](https://www.linkedin.com/company/llamaindex/)
--------------------------------------------------------------------------------
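As a taste of what `RouterQueryEngine.ipynb` covers, routing between a vector index and a summary index with Mistral models looks roughly like the sketch below; the data directory and model names are placeholders, and the notebook remains the authoritative version.

```python
from llama_index.core import Settings, SimpleDirectoryReader, SummaryIndex, VectorStoreIndex
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.tools import QueryEngineTool
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI

Settings.llm = MistralAI(model="mistral-large-latest", api_key="YOUR_MISTRAL_API_KEY")
Settings.embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key="YOUR_MISTRAL_API_KEY")

documents = SimpleDirectoryReader("./data").load_data()
vector_index = VectorStoreIndex.from_documents(documents)
summary_index = SummaryIndex.from_documents(documents)

# One tool per query engine; the selector LLM picks the tool whose
# description best matches the incoming question.
tools = [
    QueryEngineTool.from_defaults(
        query_engine=vector_index.as_query_engine(),
        description="Useful for specific factual questions about the documents.",
    ),
    QueryEngineTool.from_defaults(
        query_engine=summary_index.as_query_engine(),
        description="Useful for high-level summaries of the documents.",
    ),
]

router = RouterQueryEngine(selector=LLMSingleSelector.from_defaults(), query_engine_tools=tools)
print(router.query("Summarize the documents in two sentences."))
```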
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/README.md:
--------------------------------------------------------------------------------
1 | # AI Code Execution with Mistral's Codestral
2 |
3 | This AI data analyst can plot a linear regression chart based on CSV data. It uses Mistral's Codestral as the LLM, and the [Code Interpreter SDK](https://github.com/e2b-dev/code-interpreter) by E2B for the code interpreting capabilities. The SDK quickly creates a secure cloud sandbox powered by [Firecracker](https://github.com/firecracker-microvm/firecracker). Inside this sandbox is a running Jupyter server that the LLM can use.
4 |
5 | Read more about Mistral's new Codestral model [here](https://mistral.ai/news/codestral/).
6 |
7 | The AI agent performs a data analysis task on an uploaded CSV file, executes the AI-generated code in the sandboxed environment by E2B, and returns a chart, saving it as a PNG file.
8 |
9 |
10 | # Installation
11 |
12 | ## 1. Install dependencies
13 |
14 | Ensure all dependencies are installed:
15 |
16 | ```
17 | npm install
18 | ```
19 |
20 | ## 2. Set up environment variables
21 |
22 | Create a `.env` file in the project root directory and add your API keys:
23 |
24 | - Copy `.env.template` to `.env`
25 | - Get the [E2B API KEY](https://e2b.dev/docs/getting-started/api-key)
26 | - Get the [MISTRAL API KEY](https://console.mistral.ai/api-keys/)
27 |
28 | ## 3. Run the program
29 |
30 | ```
31 | npm run start
32 | ```
33 |
34 | The script performs the following steps:
35 |
36 | - Loads the API keys from the environment variables.
37 | - Uploads the CSV dataset to the E2B sandboxed cloud environment.
38 | - Sends a prompt to the Codestral model to generate Python code for analyzing the dataset.
39 | - Executes the generated Python code using the E2B Code Interpreter SDK.
40 | - Saves any generated visualization as a PNG file.
41 |
42 |
43 | After running the program, you should get the result of the data analysis task saved in an `image_1.png` file. You should see a plot like this:
44 |
45 | 
46 |
47 |
48 | # Connect with E2B & learn more
49 | If you encounter any problems, please let us know at our [Discord](https://discord.com/invite/U7KEcGErtQ).
50 |
51 | Check the [E2B documentation](https://e2b.dev/docs) to learn more about how to use the Code Interpreter SDK.
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-python/README.md:
--------------------------------------------------------------------------------
1 | # AI Code Execution with Mistral's Codestral
2 |
3 | This AI data analyst can plot a linear regression chart based on CSV data. It uses Mistral's Codestral as the LLM, and the [Code Interpreter SDK](https://github.com/e2b-dev/code-interpreter) by E2B for the code interpreting capabilities. The SDK quickly creates a secure cloud sandbox powered by [Firecracker](https://github.com/firecracker-microvm/firecracker). Inside this sandbox is a running Jupyter server that the LLM can use.
4 |
5 | Read more about Mistral's new Codestral model [here](https://mistral.ai/news/codestral/).
6 |
7 | The AI agent performs a data analysis task on an uploaded CSV file, executes the AI-generated code in the sandboxed environment by E2B, and returns a chart, saving it as a PNG file.
8 |
9 |
10 | # Installation
11 |
12 | ## 1. Load API keys
13 |
14 | Add your API keys to the corresponding part of the program.
15 | - Get the [E2B API KEY](https://e2b.dev/docs/getting-started/api-key)
16 | - Get the [MISTRAL API KEY](https://console.mistral.ai/api-keys/)
17 |
18 | ## 2. Run the program
19 |
20 | To work with Python Jupyter Notebooks in VSCode, activate an Anaconda environment or another Python environment in which you've installed the Jupyter package. You can run an individual cell using the Run icon and the output will be displayed below the code cell.
21 |
22 | The script performs the following steps:
23 |
24 | - Loads the API keys from the environment variables.
25 | - Uploads the CSV dataset to the E2B sandboxed cloud environment.
26 | - Sends a prompt to the Codestral model to generate Python code for analyzing the dataset.
27 | - Executes the generated Python code using the E2B Code Interpreter SDK.
28 | - Saves any generated visualization as a PNG file.
29 |
30 |
31 | After running the program, you should get the result of the data analysis task saved in an `image_1.png` file. You should see a plot like this:
32 |
33 | 
34 |
35 |
36 | # Connect with E2B & learn more
37 | If you encounter any problems, please let us know at our [Discord](https://discord.com/invite/U7KEcGErtQ).
38 |
39 | Check the [E2B documentation](https://e2b.dev/docs) to learn more about how to use the Code Interpreter SDK.
--------------------------------------------------------------------------------
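Since the notebook itself is not reproduced here, the sketch below shows the overall flow the README describes, assuming the `CodeInterpreter`/`exec_cell` API of the SDK version pinned in `requirements.txt`; the prompt, file names, and result handling are simplified placeholders rather than the cookbook's exact code, and the CSV-upload step is omitted.

```python
import base64
import os

from e2b_code_interpreter import CodeInterpreter
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])

# Ask Codestral for Python code that analyses the dataset.
prompt = ("Write Python code that loads data.csv with pandas and plots a "
          "linear regression of the last column against the first with matplotlib.")
code = client.chat(
    model="codestral-latest",
    messages=[ChatMessage(role="user", content=prompt)],
).choices[0].message.content
# Crude cleanup in case the model wraps the code in a Markdown fence.
code = code.replace("```python", "").replace("```", "")

# Run the generated code in an E2B sandbox and save any PNG results.
with CodeInterpreter(api_key=os.environ["E2B_API_KEY"]) as sandbox:
    execution = sandbox.notebook.exec_cell(code)
    for i, result in enumerate(execution.results, start=1):
        if result.png:  # base64-encoded image returned by the Jupyter kernel
            with open(f"image_{i}.png", "wb") as f:
                f.write(base64.b64decode(result.png))
```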
/third_party/gradio/chat_with_pdfs.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from mistralai.client import MistralClient
3 | from mistralai.models.chat_completion import ChatMessage
4 | import numpy as np
5 | import PyPDF2
6 | import faiss
7 |
8 | mistral_api_key = "your_api_key"
9 | cli = MistralClient(api_key = mistral_api_key)
10 |
11 | def get_text_embedding(input: str):
12 |     embeddings_batch_response = cli.embeddings(
13 |         model = "mistral-embed",
14 |         input = input
15 |     )
16 |     return embeddings_batch_response.data[0].embedding
17 |
18 | def rag_pdf(pdfs: list, question: str) -> str:
19 |     chunk_size = 4096
20 |     chunks = []
21 |     for pdf in pdfs:
22 |         chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
23 |
24 |     text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
25 |     d = text_embeddings.shape[1]
26 |     index = faiss.IndexFlatL2(d)
27 |     index.add(text_embeddings)
28 |
29 |     question_embeddings = np.array([get_text_embedding(question)])
30 |     D, I = index.search(question_embeddings, k = 4)
31 |     retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
32 |     text_retrieved = "\n\n".join(retrieved_chunk)
33 |     return text_retrieved
34 |
35 | def ask_mistral(message: str, history: list):
36 |     messages = []
37 |     pdfs = message["files"]
38 |     for couple in history:
39 |         if type(couple[0]) is tuple:
40 |             pdfs += couple[0]
41 |         else:
42 |             messages.append(ChatMessage(role= "user", content = couple[0]))
43 |             messages.append(ChatMessage(role= "assistant", content = couple[1]))
44 |
45 |     if pdfs:
46 |         pdfs_extracted = []
47 |         for pdf in pdfs:
48 |             reader = PyPDF2.PdfReader(pdf)
49 |             txt = ""
50 |             for page in reader.pages:
51 |                 txt += page.extract_text()
52 |             pdfs_extracted.append(txt)
53 |
54 |         retrieved_text = rag_pdf(pdfs_extracted, message["text"])
55 |         messages.append(ChatMessage(role = "user", content = retrieved_text + "\n\n" + message["text"]))
56 |     else:
57 |         messages.append(ChatMessage(role = "user", content = message["text"]))
58 |
59 |     full_response = ""
60 |     for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
61 |         full_response += chunk.choices[0].delta.content
62 |         yield full_response
63 |
64 | app = gr.ChatInterface(fn = ask_mistral, title = "Ask Mistral and talk to your PDFs", multimodal = True)
65 | app.launch()
--------------------------------------------------------------------------------
/third_party/langchain/README.md:
--------------------------------------------------------------------------------
1 | # LangChain <> MistralAI Cookbooks
2 |
3 | LLM agents use [planning, memory, and tools](https://lilianweng.github.io/posts/2023-06-23-agent/) to accomplish tasks. [LangGraph](https://python.langchain.com/docs/langgraph) is a library from LangChain that can be used to build reliable agents and workflows.
4 |
5 | ### Code generation
6 |
7 | We'll combine the code generation capabilities of Codestral with the self-correction approach presented in the [AlphaCodium](https://github.com/Codium-ai/AlphaCodium) paper, [constructing an answer to a coding question iteratively](https://x.com/karpathy/status/1748043513156272416?s=20).
8 |
9 | We will implement some of these ideas from scratch using [LangGraph](https://python.langchain.com/docs/langgraph) to 1) produce structured code generation output from Codestral-instruct, 2) perform inline unit tests to confirm imports and code execution work, 3) feed any errors back to Codestral for self-correction.
10 |
11 |
12 |
13 | Video overview:
14 |
15 | * https://youtu.be/zXFxmI9f06M
16 |
17 | ---
18 |
19 | ### RAG
20 |
21 | We'll apply LangGraph to build RAG agents that use ideas from 3 papers:
22 |
23 | * Corrective-RAG (CRAG) [paper](https://arxiv.org/pdf/2401.15884.pdf) uses self-grading on retrieved documents and web-search fallback if documents are not relevant.
24 | * Self-RAG [paper](https://arxiv.org/abs/2310.11511) adds self-grading on generations for hallucinations and for ability to answer the question.
25 | * Adaptive RAG [paper](https://arxiv.org/abs/2403.14403) routes queries between different RAG approaches based on their complexity.
26 |
27 | We implement each approach as a control flow in LangGraph:
28 | - **Planning:** The sequence of RAG steps (e.g., retrieval, grading, generation) that we want the agent to take
29 | - **Memory:** All the RAG-related information (input question, retrieved documents, etc.) that we want to pass between steps
30 | - **Tool use:** All the tools needed for RAG (e.g., decide web search or vectorstore retrieval based on the question)
31 |
32 | In the 3 notebooks, we will build from CRAG (blue, below) to Self-RAG (green) and finally to Adaptive RAG (red):
33 |
34 | 
35 |
36 | Each notebook builds on the prior one, so CRAG is a good entry point.
37 |
38 | Video overview:
39 |
40 | * https://www.youtube.com/watch?v=sgnrL7yo1TE
41 |
--------------------------------------------------------------------------------
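Before opening the notebooks, it can help to see the "RAG as a control flow" idea as a bare LangGraph skeleton. The sketch below follows the CRAG shape (grade, then either generate or fall back to web search); the node bodies are stubs, and the state fields and node names are illustrative assumptions — the notebooks wire in real Mistral models, retrievers, and search tools.

```python
from typing import List
from typing_extensions import TypedDict

from langgraph.graph import END, StateGraph

# Memory: everything the steps need to share.
class GraphState(TypedDict):
    question: str
    documents: List[str]
    generation: str

# Planning: each step is a node; stubs stand in for the real chains.
def retrieve(state: GraphState) -> dict:
    return {"documents": ["<retrieved chunk>"]}

def grade_documents(state: GraphState) -> dict:
    # Keep only documents graded as relevant (stubbed: keep all).
    return {"documents": state["documents"]}

def web_search(state: GraphState) -> dict:
    return {"documents": state["documents"] + ["<web result>"]}

def generate(state: GraphState) -> dict:
    return {"generation": "<answer grounded in documents>"}

# Tool use: decide whether to fall back to web search.
def decide_to_generate(state: GraphState) -> str:
    return "generate" if state["documents"] else "web_search"

workflow = StateGraph(GraphState)
workflow.add_node("retrieve", retrieve)
workflow.add_node("grade_documents", grade_documents)
workflow.add_node("web_search", web_search)
workflow.add_node("generate", generate)

workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges("grade_documents", decide_to_generate,
                               {"web_search": "web_search", "generate": "generate"})
workflow.add_edge("web_search", "generate")
workflow.add_edge("generate", END)

app = workflow.compile()
print(app.invoke({"question": "What is corrective RAG?", "documents": [], "generation": ""}))
```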
/third_party/Indexify/README.md:
--------------------------------------------------------------------------------
1 | # Indexify-Mistral Cookbooks
2 |
3 | Indexify is an open-source engine for building fast data pipelines for unstructured data (video, audio, images, and documents) using reusable extractors for embedding, transformation, and feature extraction. Indexify automatically keeps vector databases and structured databases (Postgres) updated when pipelines produce embeddings or structured data.
4 |
5 | Applications can query indexes and databases using semantic search and SQL queries.
6 |
7 | Project - [https://github.com/tensorlakeai/indexify](https://github.com/tensorlakeai/indexify)
8 |
9 | This folder contains cookbooks demonstrating how to leverage Indexify and Mistral's large language models for building production-ready pipelines for document understanding.
10 |
11 | ## Contents
12 |
13 | 1. [PDF Entity Extraction Cookbook](pdf-entity-extraction)
14 | 2. [PDF Summarization Cookbook](pdf-summarization)
15 |
16 | ## Overview
17 |
18 | These cookbooks showcase the integration of Indexify's structured data extraction capabilities with Mistral's advanced language models.
19 |
20 | ### PDF Entity Extraction Cookbook
21 |
22 | Learn how to build a robust entity [extraction pipeline for PDF](pdf-entity-extraction/README.md) documents. This cookbook covers:
23 |
24 | - Setting up Indexify and required extractors
25 | - Creating an extraction graph for entity recognition
26 | - Implementing the extraction pipeline
27 | - Customizing the entity extraction process
28 |
29 | ### PDF Summarization Cookbook
30 |
31 | Explore how to create an efficient [PDF summarization pipeline](pdf-summarization/README.md). This cookbook includes:
32 |
33 | - Installation and setup of necessary components
34 | - Defining an extraction graph for document summarization
35 | - Building and running the summarization pipeline
36 | - Tips for customizing and enhancing the summarization process
37 |
38 | ## Prerequisites
39 |
40 | Before using these cookbooks, ensure you have:
41 |
42 | - A virtual environment with Python 3.9 or later:
43 | ```shell
44 | python3.9 -m venv ve
45 | source ve/bin/activate
46 | ```
47 | - pip (Python package manager)
48 | - A Mistral API key
49 | - Basic familiarity with Python and command-line interfaces
50 |
51 | ## Getting Started
52 |
53 | 1. Install Indexify and the required extractors as detailed in each cookbook.
54 | 2. Review the cookbooks to understand the workflow and components.
55 | 3. Follow the step-by-step instructions to implement the pipelines.
56 | 4. Experiment with customizations to tailor the solutions to your specific needs.
57 |
--------------------------------------------------------------------------------
/third_party/mesop/chat_with_pdfs.py:
--------------------------------------------------------------------------------
1 | import io
2 | import mesop as me
3 | import mesop.labs as mel
4 | from mistralai.client import MistralClient
5 | from mistralai.models.chat_completion import ChatMessage
6 | import numpy as np
7 | import PyPDF2
8 | import faiss
9 |
10 | mistral_api_key = "api_key"
11 | cli = MistralClient(api_key = mistral_api_key)
12 |
13 | def get_text_embedding(input: str):
14 |     embeddings_batch_response = cli.embeddings(
15 |         model = "mistral-embed",
16 |         input = input
17 |     )
18 |     return embeddings_batch_response.data[0].embedding
19 |
20 | def rag_pdf(pdfs: list, question: str) -> str:
21 |     chunk_size = 4096
22 |     chunks = []
23 |     for pdf in pdfs:
24 |         chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
25 |
26 |     text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
27 |     d = text_embeddings.shape[1]
28 |     index = faiss.IndexFlatL2(d)
29 |     index.add(text_embeddings)
30 |
31 |     question_embeddings = np.array([get_text_embedding(question)])
32 |     D, I = index.search(question_embeddings, k = 4)
33 |     retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
34 |     text_retrieved = "\n\n".join(retrieved_chunk)
35 |     return text_retrieved
36 |
37 | def ask_mistral(message: str, history: list[mel.ChatMessage]):
38 |     messages = [ChatMessage(role=m.role, content=m.content) for m in history[:-1]]
39 |
40 |     state = me.state(State)
41 |     if state.content:
42 |         retrieved_text = rag_pdf([state.content], message)
43 |         messages[-1] = ChatMessage(role = "user", content = retrieved_text + "\n\n" + messages[-1].content)
44 |
45 |     for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
46 |         yield chunk.choices[0].delta.content
47 |
48 | @me.stateclass
49 | class State:
50 |     content: str
51 |
52 | def handle_upload(event: me.UploadEvent):
53 |     state = me.state(State)
54 |     reader = PyPDF2.PdfReader(io.BytesIO(event.file.getvalue()))
55 |     txt = ""
56 |     for page in reader.pages:
57 |         txt += page.extract_text()
58 |     state.content = txt
59 |
60 | @me.page(title="Talk to Mistral")
61 | def page():
62 |     with me.box(style=me.Style(height = "100%", display="flex", flex_direction="column", align_items="center",padding=me.Padding(top = 0, left = 30, right = 30, bottom = 0))):
63 |         with me.box(style=me.Style(padding=me.Padding(top = 16), position="fixed")):
64 |             me.uploader(
65 |                 label="Upload PDF",
66 |                 accepted_file_types=["file/pdf"],
67 |                 on_upload=handle_upload,
68 |             )
69 |         with me.box(style=me.Style(width="100%")):
70 |             mel.chat(ask_mistral, title="Ask Mistral", bot_user="Mistral")
71 |
--------------------------------------------------------------------------------
/third_party/panel/chat_with_pdfs.py:
--------------------------------------------------------------------------------
1 | import io
2 | from mistralai.client import MistralClient
3 | from mistralai.models.chat_completion import ChatMessage
4 | import numpy as np
5 | import panel as pn
6 | import PyPDF2
7 | import faiss
8 |
9 | pn.extension()
10 |
11 | mistral_api_key = "your_api_key"
12 | cli = MistralClient(api_key = mistral_api_key)
13 |
14 | def get_text_embedding(input_text: str):
15 |     embeddings_batch_response = cli.embeddings(
16 |         model="mistral-embed",
17 |         input=input_text
18 |     )
19 |     return embeddings_batch_response.data[0].embedding
20 |
21 | def rag_pdf(pdfs: list, question: str) -> str:
22 |     chunk_size = 4096
23 |     chunks = []
24 |     for pdf in pdfs:
25 |         chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
26 |
27 |     text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
28 |     d = text_embeddings.shape[1]
29 |     index = faiss.IndexFlatL2(d)
30 |     index.add(text_embeddings)
31 |
32 |     question_embeddings = np.array([get_text_embedding(question)])
33 |     D, I = index.search(question_embeddings, k = 4)
34 |     retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
35 |     text_retrieved = "\n\n".join(retrieved_chunk)
36 |     return text_retrieved
37 |
38 | async def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
39 |     if type(contents) is str:
40 |         messages_objects = [w for w in instance.objects if w.user != "System" and type(w.object) is not pn.chat.message._FileInputMessage]
41 |         messages = [ChatMessage(
42 |             role="user" if w.user == "User" else "assistant",
43 |             content=w.object
44 |         ) for w in messages_objects]
45 |
46 |         pdf_objects = [w for w in instance.objects if w.user != "System" and w not in messages_objects]
47 |         if pdf_objects:
48 |             pdfs = []
49 |             for w in pdf_objects:
50 |                 reader = PyPDF2.PdfReader(io.BytesIO(w.object.contents))
51 |                 txt = ""
52 |                 for page in reader.pages:
53 |                     txt += page.extract_text()
54 |                 pdfs.append(txt)
55 |             messages[-1].content = rag_pdf(pdfs, contents) + "\n\n" + contents
56 |
57 |         response = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024, temperature = 0.7)
58 |         message = ""
59 |         for chunk in response:
60 |             message += chunk.choices[0].delta.content
61 |             yield message
62 |
63 | chat_interface = pn.chat.ChatInterface(widgets = [pn.widgets.TextInput(),pn.widgets.FileInput(accept = ".pdf")], callback = callback, callback_user = "Mistral")
64 | chat_interface.send("Chat with Mistral and talk to your PDFs!", user = "System", respond = False)
65 | chat_interface.servable()
--------------------------------------------------------------------------------
/third_party/streamlit/chat_with_pdfs.py:
--------------------------------------------------------------------------------
1 | import io
2 | import streamlit as st
3 | from mistralai.client import MistralClient
4 | import numpy as np
5 | import PyPDF2
6 | import faiss
7 |
8 | mistral_api_key = "your_api_key"
9 | cli = MistralClient(api_key = mistral_api_key)
10 |
11 | def get_text_embedding(input: str):
12 |     embeddings_batch_response = cli.embeddings(
13 |         model = "mistral-embed",
14 |         input = input
15 |     )
16 |     return embeddings_batch_response.data[0].embedding
17 |
18 | def rag_pdf(pdfs: list, question: str) -> str:
19 |     chunk_size = 4096
20 |     chunks = []
21 |     for pdf in pdfs:
22 |         chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
23 |
24 |     text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
25 |     d = text_embeddings.shape[1]
26 |     index = faiss.IndexFlatL2(d)
27 |     index.add(text_embeddings)
28 |
29 |     question_embeddings = np.array([get_text_embedding(question)])
30 |     D, I = index.search(question_embeddings, k = 4)
31 |     retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
32 |     text_retrieved = "\n\n".join(retrieved_chunk)
33 |     return text_retrieved
34 |
35 | st.title("Chat with Mistral and your PDFs")
36 |
37 | if "messages" not in st.session_state:
38 |     st.session_state.messages = []
39 |     st.session_state.pdfs = []
40 |
41 | for message in st.session_state.messages:
42 |     with st.chat_message(message["role"]):
43 |         st.markdown(message["content"])
44 |
45 | def ask_mistral(messages: list, pdfs_bytes: list):
46 |     if pdfs_bytes:
47 |         pdfs = []
48 |         for pdf in pdfs_bytes:
49 |             reader = PyPDF2.PdfReader(pdf)
50 |             txt = ""
51 |             for page in reader.pages:
52 |                 txt += page.extract_text()
53 |             pdfs.append(txt)
54 |         messages[-1]["content"] = rag_pdf(pdfs, messages[-1]["content"]) + "\n\n" + messages[-1]["content"]
55 |     resp = cli.chat_stream(model="open-mistral-7b", messages = messages, max_tokens = 1024)
56 |     for chunk in resp:
57 |         yield chunk.choices[0].delta.content
58 |
59 | if prompt := st.chat_input("Talk to Mistral!"):
60 |     with st.chat_message("user"):
61 |         st.markdown(prompt)
62 |     st.session_state.messages.append({"role": "user", "content": prompt})
63 |
64 |     with st.chat_message("assistant"):
65 |         response_generator = ask_mistral(st.session_state.messages, st.session_state.pdfs)
66 |         response = st.write_stream(response_generator)
67 |
68 |     st.session_state.messages.append({"role": "assistant", "content": response})
69 |
70 | uploaded_file = st.file_uploader("Choose a file", type = ["pdf"])
71 | if uploaded_file is not None:
72 |     bytes_io = io.BytesIO(uploaded_file.getvalue())
73 |
74 |     st.session_state.pdfs.append(bytes_io)
--------------------------------------------------------------------------------
/third_party/Ollama/20240321_ollama_meetup/run.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import json
3 | import os
4 |
5 | from mistralai.client import MistralClient
6 | from openai import OpenAI
7 |
8 | from myfuncs import list_pods
9 |
10 | api_key = os.environ.get("MISTRAL_API_KEY")
11 |
12 | online_model = "mistral-small-latest"
13 | offline_model = "mistral" # ollama naming convention
14 |
15 | tools = [
16 |     {
17 |         "type": "function",
18 |         "function": {
19 |             "name": "list_pods",
20 |             "description": "Get the list of all Kubernetes pods and their status in a given namespace",
21 |             "parameters": {
22 |                 "type": "object",
23 |                 "properties": {
24 |                     "namespace": {
25 |                         "type": "string",
26 |                         "description": "The name of the namespace to look into",
27 |                     },
28 |                 },
29 |                 "required": ["namespace"],
30 |             },
31 |         }
32 |     },
33 | ]
34 |
35 | callables = {"list_pods": list_pods}
36 |
37 | user_input = input("😺 Hello! How can I help you?\n")
38 |
39 | # Retrieve user input then generate function inputs with mistral-small
40 | messages = []
41 | messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
42 | messages.append({"role": "user", "content": f"Execute the following task on your K8S cluster: {user_input}"})
43 | online_client = MistralClient(api_key=api_key)
44 | resp_tool = online_client.chat(model=online_model, messages=messages, tools=tools, tool_choice="any")
45 | print(f"⏳ Using online model {online_model} to generate function inputs, un instant svp...")
46 | tool_call = resp_tool.choices[0].message.tool_calls[0]
47 | function_name = tool_call.function.name
48 | function_params = json.loads(tool_call.function.arguments)
49 | print(f"😎 Switching to offline execution and calling ollama's {offline_model}. C'est parti!\n\n")
50 |
51 | # Run the function
52 | partial_func = functools.partial(callables[function_name], **function_params)
53 | out = partial_func()
54 |
55 | # Format the function output with ollama-mistral (7b)
56 | local_client = OpenAI(base_url = "http://127.0.0.1:11434/v1", api_key='ollama')
57 | response = local_client.chat.completions.create(stream=True, model=offline_model, messages=[
58 |     {"role": "system", "content": "You are a master of Kubernetes who likes friendly French-related jokes."},
59 |     {"role": "user", "content": f"""
60 | Here is a list of K8S pods in a JSON format:\n {out}
61 | Transform it into a bullet-point-like list in idiomatic English and add a few French-related jokes.
62 | """}])
63 |
64 | for chunk in response:
65 |     if chunk.choices[0].delta.content is not None:
66 |         print(chunk.choices[0].delta.content, end="")
67 |
68 |
--------------------------------------------------------------------------------
/mistral/data_generation/external_files/guide_1.txt:
--------------------------------------------------------------------------------
1 | 1. Introduction
2 | This style guide aims to provide clear and consistent guidelines for writers and editors, ensuring that the publication maintains its distinctive voice, tone, and editorial standards.
3 |
4 | 2. Voice and Tone
5 | 2.1 The voice should be authoritative, engaging, and informed, providing readers with insightful analysis and commentary.
6 | 2.2 The tone should be concise, clear, and witty, avoiding jargon and overly complex language.
7 | 2.3 Aim for a balanced and impartial approach, presenting different perspectives and arguments fairly.
8 |
9 | 3. Language and Grammar
10 | 3.1 Use proper English spelling and grammar.
11 | 3.2 Use the active voice whenever possible.
12 | 3.3 Keep sentences and paragraphs concise and focused.
13 | 3.4 Use the serial comma (also known as the Oxford comma).
14 | 3.5 Avoid using contractions (e.g., don't, can't) in formal writing.
15 | 3.6 Use single quotation marks for quotes and double quotation marks for quotes within quotes.
16 | 3.7 Use italics for emphasis, book and film titles, and foreign words or phrases.
17 |
18 | 4. Numbers and Dates
19 | 4.1 Spell out numbers from one to nine; use numerals for 10 and above.
20 | 4.2 Use numerals for percentages (e.g., 5%) and currency amounts (e.g., £5).
21 | 4.3 Use commas as thousand separators (e.g., 1,000,000).
22 | 4.4 Write dates in the day-month-year format (e.g., 5th April 2023).
23 | 4.5 Use the 24-hour clock for time (e.g., 14:00).
24 |
25 | 5. Citations and References
26 | 5.1 Cite sources accurately and consistently, using in-text citations and a reference list at the end of the article.
27 | 5.2 Use proper citations.
28 | 5.3 Include a hyperlink to the original source when citing online materials.
29 |
30 | 6. Headlines and Subheadings
31 | 6.1 Write clear, concise, and engaging headlines that accurately reflect the content of the article.
32 | 6.2 Use subheadings to break up the text and guide readers through the article.
33 | 6.3 Ensure that headlines and subheadings use sentence case, with only the first word and proper nouns capitalized.
34 |
35 | 7. Fact-checking and Accuracy
36 | 7.1 Verify all facts, figures, and quotes to ensure accuracy.
37 | 7.2 Double-check names, titles, and affiliations of individuals and organizations mentioned in the article.
38 | 7.3 Update information as necessary to maintain the article's relevance and accuracy.
39 |
40 | 8. Inclusive Language
41 | 8.1 Use gender-neutral language and avoid stereotypes.
42 | 8.2 Respect the preferences of individuals regarding their identities, including gender, race, and nationality.
43 | 8.3 Be sensitive to cultural differences and avoid language that may be offensive or alienating to readers.
44 |
45 | 9. Legal and Ethical Considerations
46 | 9.1 Adhere to a code of ethics, which includes principles such as honesty, fairness, and transparency.
47 | 9.2 Avoid plagiarism by properly attributing all sources and using original language and analysis.
48 | 9.3 Respect copyright and intellectual property rights, obtaining permission to use copyrighted materials when necessary.
--------------------------------------------------------------------------------
/third_party/wandb/README.md:
--------------------------------------------------------------------------------
1 | [](https://wandb.ai/capecape/llm-judge-webinar/weave)
2 |
3 | This repo is a companion to the [Mistral and W&B webinar](https://www.youtube.com/watch?v=VBbq7NPWzlo).
4 |
5 | [](https://www.youtube.com/watch?v=VBbq7NPWzlo)
6 |
7 | # LLM Judge: Detecting hallucinations in language models
8 |
9 | This project demonstrates how to fine-tune and evaluate a Mistral AI language model to detect factual inconsistencies and hallucinations in text summaries. It is based on [this amazing blog post](https://eugeneyan.com/writing/finetuning/) by Eugene Yan.
10 |
11 | In this project, we will:
12 |
13 | - Prepare datasets from the Factual Inconsistency Benchmark (FIB) and USB
14 | - Fine-tune a Mistral 7B model for hallucination detection
15 | - Evaluate model performance using accuracy, F1 score, precision, and recall
16 | - Integrate with Weights & Biases for experiment tracking
17 |
18 | ## Weave
19 |
20 | In this project we make extensive use of [Weave](https://wandb.github.io/weave) to trace and organize our model evaluations.
21 |
22 | - You can get started with Weave and MistralAI by following the [quickstart guide](https://wandb.github.io/weave/guides/integrations/mistral)
23 |
24 | ## Usage
25 |
26 | 1. Prepare the data:
27 | - Run `01_prepare_data.ipynb` to process and format the datasets
28 |
29 | > The dataset is also available in the `data` folder, so you may skip this notebook.
30 |
31 | 2. Fine-tune and evaluate the model:
32 | - Run `02_finetune_and_eval.ipynb` to:
33 | - Evaluate baseline Mistral models (7B and Large)
34 | - Fine-tune a Mistral 7B model
35 | - Evaluate the fine-tuned model
36 |
37 | ## Results
38 |
39 | The notebook demonstrates improvements in hallucination detection after fine-tuning, with detailed metrics and comparisons between model versions.
40 |
41 | [](https://wandb.ai/capecape/llm-judge-webinar/weave/compare-evaluations?evaluationCallIds=%5B%224e6f6a62-c592-40de-bfce-2ac7d19707e6%22%2C%22af3ef7a1-2f80-4786-bdce-988454f278ad%22%2C%2248a173f9-ceff-46f7-b213-a1462706c966%22%2C%222bed90db-93cf-4cb2-8fd3-999bdd0600e5%22%5D)
42 |
43 | All the results and evaluation are logged to [this Weave Project](https://wandb.ai/capecape/llm-judge-webinar/weave)
44 |
45 | The fine-tuning process is also logged to Weights & Biases, in the [same project](https://wandb.ai/capecape/llm-judge-webinar?nw=nwusercapecape) as the model evals.
46 |
47 | [](https://wandb.ai/capecape/llm-judge-webinar/workspace)
48 |
49 | ## Docs
50 |
51 | - Weights & Biases: https://wandb.ai/
52 | - Mistral finetuning docs: https://docs.mistral.ai/capabilities/finetuning/
53 | - Tracing with W&B Weave: https://wandb.me/weave
54 |
55 | ## Notes
56 |
57 | - Ensure you have the necessary API keys for Mistral AI and Weights & Biases
58 | - Adjust `NUM_SAMPLES` in the evaluation notebook to control the number of examples used
59 |
60 | For more details, refer to the individual notebooks and comments within the code.
--------------------------------------------------------------------------------
/mistral/data_generation/external_files/guide_2.txt:
--------------------------------------------------------------------------------
1 | 1. Introduction
2 | This manual is designed to offer explicit and uniform directions for authors and editors, ensuring the preservation of our publication's unique style, tone, and quality standards.
3 |
4 | 2. Style and Tone
5 | 2.1 The style should be confident, captivating, and knowledgeable, offering readers profound analysis and thoughtful commentary.
6 | 2.2 The tone should be succinct, lucid, and clever, evading technical jargon and unnecessarily complex language.
7 | 2.3 Strive for an equitable and unbiased method, presenting varied viewpoints and arguments objectively.
8 |
9 | 3. Language and Grammar
10 | 3.1 Utilize correct English spelling and grammar.
11 | 3.2 Prioritize the active voice whenever feasible.
12 | 3.3 Keep sentences and paragraphs brief and to the point.
13 | 3.4 Employ the serial comma (also known as the Oxford comma).
14 | 3.5 Refrain from using contractions (e.g., don't, can't) in professional writing.
15 | 3.6 Use single quotation marks for quotations and double quotation marks for quotations within quotations.
16 | 3.7 Use italics for emphasis, titles of books and films, and foreign words or phrases.
17 |
18 | 4. Numbers and Dates
19 | 4.1 Write out numbers from one to nine; use numerals for 10 and higher.
20 | 4.2 Use numerals for percentages (e.g., 5%) and monetary amounts (e.g., £5).
21 | 4.3 Use commas as thousand separators (e.g., 1,000,000).
22 | 4.4 Format dates as day-month-year (e.g., 5th April 2023).
23 | 4.5 Use the 24-hour clock for time (e.g., 14:00).
24 |
25 | 5. Citations and References
26 | 5.1 Cite sources precisely and consistently, using in-text citations and a reference list at the conclusion of the article.
27 | 5.2 Use appropriate citations.
28 | 5.3 Include a hyperlink to the original source when referencing online materials.
29 |
30 | 6. Headlines and Subheadings
31 | 6.1 Craft clear, concise, and intriguing headlines that accurately represent the article's content.
32 | 6.2 Use subheadings to divide the text and direct readers through the article.
33 | 6.3 Ensure that headlines and subheadings use sentence case, capitalizing only the first word and proper nouns.
34 |
35 | 7. Fact-checking and Accuracy
36 | 7.1 Confirm all facts, figures, and quotations to guarantee precision.
37 | 7.2 Carefully review names, titles, and affiliations of individuals and organizations mentioned in the article.
38 | 7.3 Revise information as needed to maintain the article's relevance and accuracy.
39 |
40 | 8. Inclusive Language
41 | 8.1 Employ gender-neutral language and avoid stereotypes.
42 | 8.2 Honor the preferences of individuals regarding their identities, including gender, race, and nationality.
43 | 8.3 Be mindful of cultural differences and avoid language that may be offensive or alienating to readers.
44 |
45 | 9. Legal and Ethical Considerations
46 | 9.1 Comply with a code of ethics, which includes principles such as honesty, fairness, and transparency.
47 | 9.2 Prevent plagiarism by correctly citing all sources and using original language and analysis.
48 | 9.3 Respect copyright and intellectual property rights, obtaining permission to use copyrighted materials when required.
--------------------------------------------------------------------------------
/mistral/data_generation/external_files/guide_4.txt:
--------------------------------------------------------------------------------
1 | 1. Introduction
2 | This handbook aims to supply straightforward and uniform guidance for writers and editors, ensuring our publication's unique voice, style, and editorial standards are preserved.
3 |
4 | 2. Style and Tone
5 | 2.1 The writing should be self-assured, appealing, and well-versed, offering readers thought-provoking analysis and commentary.
6 | 2.2 The tone should be concise, understandable, and witty, avoiding specialized terminology and complex language.
7 | 2.3 Strive for balance and impartiality, presenting multiple perspectives and arguments fairly.
8 |
9 | 3. Language and Grammar
10 | 3.1 Follow standard English spelling and grammar conventions.
11 | 3.2 Use the active voice whenever appropriate.
12 | 3.3 Keep sentences and paragraphs brief and coherent.
13 | 3.4 Implement the serial comma (also known as the Oxford comma).
14 | 3.5 Refrain from using contractions (e.g., don't, can't) in professional writing.
15 | 3.6 Use single quotation marks for quotations and double quotation marks for nested quotations.
16 | 3.7 Use italics for emphasis, titles of books and films, and foreign terms or phrases.
17 |
18 | 4. Numbers and Dates
19 | 4.1 Spell out numbers from one to nine; use numerals for 10 and higher.
20 | 4.2 Use numerals for percentages (e.g., 5%) and financial amounts (e.g., £5).
21 | 4.3 Use commas as thousand separators (e.g., 1,000,000).
22 | 4.4 Format dates as day-month-year (e.g., 5th April 2023).
23 | 4.5 Use the 24-hour clock for time (e.g., 14:00).
24 |
25 | 5. Citations and References
26 | 5.1 Cite sources precisely and consistently, featuring in-text citations and a reference list at the article's conclusion.
27 | 5.2 Adhere to proper citation guidelines.
28 | 5.3 Include a hyperlink to the original source when referencing online materials.
29 |
30 | 6. Headlines and Subheadings
31 | 6.1 Develop clear, brief, and engaging headlines that accurately represent the article's content.
32 | 6.2 Utilize subheadings to segment the text and assist readers in following the article.
33 | 6.3 Ensure that headlines and subheadings use sentence case, capitalizing only the first word and proper nouns.
34 |
35 | 7. Fact-checking and Accuracy
36 | 7.1 Confirm all facts, figures, and quotations to ensure precision.
37 | 7.2 Thoroughly review names, titles, and affiliations of individuals and organizations mentioned in the article.
38 | 7.3 Modify information as needed to maintain the article's relevance and accuracy.
39 |
40 | 8. Inclusive Language
41 | 8.1 Employ gender-neutral language and avoid stereotypes.
42 | 8.2 Respect the preferences of individuals regarding their identities, including gender, race, and nationality.
43 | 8.3 Be considerate of cultural differences and avoid language that may be offensive or alienating to readers.
44 |
45 | 9. Legal and Ethical Considerations
46 | 9.1 Uphold a code of ethics, which includes principles such as integrity, fairness, and transparency.
47 | 9.2 Prevent plagiarism by properly citing all sources and using original language and analysis.
48 | 9.3 Honor copyright and intellectual property rights, acquiring permission to use copyrighted materials when required.
--------------------------------------------------------------------------------
/mistral/data_generation/external_files/guide_3.txt:
--------------------------------------------------------------------------------
1 | 1. Introduction
2 | This manual serves as a guide to provide clear-cut and consistent instructions for authors and editors, ensuring that our publication's distinct voice, tone, and editorial standards are upheld.
3 |
4 | 2. Style and Tone
5 | 2.1 The writing style should be confident, engaging, and well-informed, delivering readers with insightful analysis and commentary.
6 | 2.2 The tone should be brief, comprehensible, and clever, steering clear of industry jargon and convoluted language.
7 | 2.3 Maintain a fair and impartial stance, presenting diverse viewpoints and arguments evenhandedly.
8 |
9 | 3. Language and Grammar
10 | 3.1 Adhere to proper English spelling and grammar rules.
11 | 3.2 Opt for the active voice whenever possible.
12 | 3.3 Keep sentences and paragraphs brief and focused.
13 | 3.4 Include the serial comma (also known as the Oxford comma).
14 | 3.5 Avoid using contractions (e.g., don't, can't) in formal writing.
15 | 3.6 Use single quotation marks for quotes and double quotation marks for quotes within quotes.
16 | 3.7 Use italics for emphasis, book and movie titles, and foreign words or phrases.
17 |
18 | 4. Numbers and Dates
19 | 4.1 Write out numbers one through nine; use numerals for 10 and above.
20 | 4.2 Use numerals for percentages (e.g., 5%) and monetary values (e.g., £5).
21 | 4.3 Use commas as thousand separators (e.g., 1,000,000).
22 | 4.4 Present dates in the day-month-year format (e.g., 5th April 2023).
23 | 4.5 Use the 24-hour clock for time (e.g., 14:00).
24 |
25 | 5. Citations and References
26 | 5.1 Cite sources accurately and consistently, incorporating in-text citations and a reference list at the end of the article.
27 | 5.2 Use appropriate citation formats.
28 | 5.3 Provide a hyperlink to the original source when citing online content.
29 |
30 | 6. Headlines and Subheadings
31 | 6.1 Create clear, succinct, and captivating headlines that faithfully reflect the content of the article.
32 | 6.2 Employ subheadings to organize the text and help readers navigate through the article.
33 | 6.3 Ensure that headlines and subheadings use sentence case, capitalizing only the first word and proper nouns.
34 |
35 | 7. Fact-checking and Accuracy
36 | 7.1 Verify all facts, statistics, and quotes to ensure accuracy.
37 | 7.2 Double-check names, titles, and associations of individuals and organizations mentioned in the article.
38 | 7.3 Update information as necessary to preserve the article's relevance and accuracy.
39 |
40 | 8. Inclusive Language
41 | 8.1 Use gender-inclusive language and refrain from stereotypes.
42 | 8.2 Acknowledge the preferences of individuals regarding their identities, including gender, race, and nationality.
43 | 8.3 Be sensitive to cultural differences and avoid language that may be disrespectful or alienating to readers.
44 |
45 | 9. Legal and Ethical Considerations
46 | 9.1 Abide by a code of ethics, which encompasses principles such as honesty, fairness, and transparency.
47 | 9.2 Avoid plagiarism by properly crediting all sources and using original language and analysis.
48 | 9.3 Respect copyright and intellectual property rights, securing permission to use copyrighted materials when necessary.
--------------------------------------------------------------------------------
/third_party/x-cmd/README.md:
--------------------------------------------------------------------------------
1 | ## The Mistral AI Command Line Client
2 |
3 | The **mistral module** is a command-line client tool built by the x-cmd team using the Mistral AI API. Written in POSIX shell and AWK, it uses `curl` to send API requests.
4 |
5 | ## Getting started
6 |
7 | ### Installing x-cmd
8 |
9 | - x-cmd is compatible with **Windows**, **Linux**, and **macOS**, making installation easy and straightforward
10 | ```sh
11 | eval "$(curl https://get.x-cmd.com)"
12 | # or
13 | eval "$(wget -O- https://get.x-cmd.com)"
14 | ```
15 | - For more installation methods and instructions, please refer to the [official documentation](https://www.x-cmd.com/start/).
16 |
17 | ### Configuring `x mistral`
18 |
19 | Obtaining a **Mistral AI API Key**: https://console.mistral.ai/api-keys/
20 |
21 | ```sh
22 | x mistral init
23 | # or
24 | x mistral --cfg apikey=
25 | x mistral --cfg model=
26 | ```
27 | 
28 |
29 | ### Use Mistral AI
30 |
31 | - `x mistral` allows you to **send messages or files to Mistral AI**. To make things easier, x-cmd also provides `@mistral` as an alias for the `x mistral` command.
32 |
33 | ```sh
34 | x mistral chat request "hello"
35 | @mistral "hello"
36 | @mistral --file "Translate to French"
37 | ```
38 | 
39 |
40 |
41 | - `x mistral` can help analyze command results and supports **opening a dialogue in interactive mode**.
42 |
43 | ```sh
44 | x jina r "https://www.x-cmd.com/start/guide" | @mistral
45 | ```
46 | **[`x jina r`](https://www.x-cmd.com/mod/jina):** Uses **Jina.ai** to extract content from web pages.
47 |
48 | 
49 |
50 | ## Command Line Options
51 |
52 | We offer the `x mistral` and `@mistral` commands, where `x mistral` focuses on model configuration and download management, while `@mistral` emphasizes model applications. Their command-line options are as follows:
53 |
54 | 1. `x mistral`:
55 | ```sh
56 | SUBCOMMANDS:
57 | init Initialize the configuration using interactive mode
58 | --cur current session default value management
59 | --cfg Manage config item like apikey, etc
60 | chat chat with mistral
61 | model Model viewing and management
62 | ```
63 | 2. `@mistral`:
64 | ```sh
65 | -t,--temperature Control the diversity of model generated results, the range is [0 ~ 1], when the temperat
66 | -e Send the variable value as context to AI
67 | -f,--file Send file content as context to AI
68 | -n,--history Specify the number of history as context
69 | -p Specify to get pipe content
70 | -P,--nopipe Specify not to get pipe content
71 | -c Confirm before sending the request content to AI
72 | --jina Through jina reader, send the webpage content as context to AI
73 | --ddgo Send ddgo search content as context to AI
74 | --tldr Send tldr content as context to AI
75 | --eval Send the execution command results as context to AI
76 | --kv Send key-value pairs as context to AI
77 | --session Specify session value
78 | --minion Specify minion file
79 | --model Specify AI model
80 | --edit Edit the request content in the terminal
81 | --numbered List the data with line numbers and send it
82 | --question Request content
83 | ```
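
As a usage sketch combining a few of these options (the file name is only an example):

```sh
# Send a file as context with a low temperature, confirming the request before it is sent
@mistral -c -t 0.2 -f notes.md "Summarize these notes in three bullet points"
```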
84 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
--------------------------------------------------------------------------------
/third_party/Chainlit/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import asyncio
4 | import chainlit as cl
5 | from dotenv import load_dotenv
6 |
7 | from mistralai.client import MistralClient
8 |
9 | load_dotenv()
10 |
11 | mai_client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
12 |
13 | cl.instrument_mistralai()
14 |
15 |
16 | @cl.step(type="tool", name="get_current_weather")
17 | async def get_current_weather(location):
18 | # Make an actual API call! To open-meteo.com for instance.
19 | return json.dumps(
20 | {
21 | "location": location,
22 | "temperature": "29",
23 | "unit": "celsius",
24 | "forecast": ["sunny"],
25 | }
26 | )
27 |
28 |
29 | @cl.step(type="tool", name="get_home_town")
30 | async def get_home_town(person: str) -> str:
31 | """Get the hometown of a person"""
32 | if "Napoleon" in person:
33 | return "Ajaccio, Corsica"
34 | elif "Michel" in person:
35 | return "Caprese, Italy"
36 | else:
37 | return "Paris, France"
38 |
39 |
40 | """
41 | JSON tool definitions provided to the LLM.
42 | """
43 | tools = [
44 | {
45 | "type": "function",
46 | "function": {
47 | "name": "get_home_town",
48 | "description": "Get the home town of a specific person",
49 | "parameters": {
50 | "type": "object",
51 | "properties": {
52 | "person": {
53 | "type": "string",
54 | "description": "The name of a person (first and last names) to identify.",
55 | }
56 | },
57 | "required": ["person"],
58 | },
59 | },
60 | },
61 | {
62 | "type": "function",
63 | "function": {
64 | "name": "get_current_weather",
65 | "description": "Get the current weather in a given location",
66 | "parameters": {
67 | "type": "object",
68 | "properties": {
69 | "location": {
70 | "type": "string",
71 | "description": "The city and state, e.g. San Francisco, CA",
72 | },
73 | },
74 | "required": ["location"],
75 | },
76 | },
77 | },
78 | ]
79 |
80 |
81 | async def run_multiple(tool_calls):
82 | """
83 | Execute multiple tool calls asynchronously.
84 | """
85 | available_tools = {
86 | "get_current_weather": get_current_weather,
87 | "get_home_town": get_home_town,
88 | }
89 |
90 | async def run_single(tool_call):
91 | function_name = tool_call.function.name
92 | function_to_call = available_tools[function_name]
93 | function_args = json.loads(tool_call.function.arguments)
94 |
95 | function_response = await function_to_call(**function_args)
96 | return {
97 | "tool_call_id": tool_call.id,
98 | "role": "tool",
99 | "name": function_name,
100 | "content": function_response,
101 | }
102 |
103 | # Run tool calls in parallel.
104 | tool_results = await asyncio.gather(
105 | *(run_single(tool_call) for tool_call in tool_calls)
106 | )
107 | return tool_results
108 |
109 |
110 | @cl.step(type="run", tags=["to_score"])
111 | async def run_agent(user_query: str):
112 | messages = [{"role": "user", "content": f"{user_query}"}]
113 |
114 | number_iterations = 0
115 | answer_message_content = None
116 |
117 | while number_iterations < 5:
118 | completion = mai_client.chat(
119 | model="mistral-large-latest",
120 | messages=messages,
121 | tool_choice="auto",
122 | tools=tools,
123 | )
124 | message = completion.choices[0].message
125 | messages.append(message)
126 | answer_message_content = message.content
127 |
128 | if not message.tool_calls:
129 | break
130 |
131 | tool_results = await run_multiple(message.tool_calls)
132 | messages.extend(tool_results)
133 |
134 | number_iterations += 1
135 |
136 | return answer_message_content
137 |
138 |
139 | @cl.set_starters
140 | async def set_starters():
141 | return [
142 | cl.Starter(
143 | label="What's the weather in Napoleon's hometown",
144 | message="What's the weather in Napoleon's hometown?",
145 | icon="/public/idea.svg",
146 | ),
147 | cl.Starter(
148 | label="What's the weather in Paris, TX?",
149 | message="What's the weather in Paris, TX?",
150 | icon="/public/learn.svg",
151 | ),
152 | cl.Starter(
153 | label="What's the weather in Michel-Angelo's hometown?",
154 | message="What's the weather in Michel-Angelo's hometown?",
155 | icon="/public/write.svg",
156 | ),
157 | ]
158 |
159 |
160 | @cl.on_message
161 | async def main(message: cl.Message):
162 | """
163 | Main message handler for incoming user messages.
164 | """
165 | answer_message = await run_agent(message.content)
166 | await cl.Message(content=answer_message).send()
167 |
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-summarization/README.md:
--------------------------------------------------------------------------------
1 | # PDF Summarization with Indexify and Mistral
2 |
3 | In this cookbook, we'll explore how to create a PDF summarization pipeline using Indexify and Mistral's large language models. By the end of the document, you should have a pipeline capable of ingesting thousands of PDF documents and using Mistral to summarize them.
4 |
5 | ## Table of Contents
6 |
7 | 1. [Introduction](#introduction)
8 | 2. [Prerequisites](#prerequisites)
9 | 3. [Setup](#setup)
10 | - [Install Indexify](#install-indexify)
11 | - [Install Required Extractors](#install-required-extractors)
12 | 4. [Creating the Extraction Graph](#creating-the-extraction-graph)
13 | 5. [Implementing the Summarization Pipeline](#implementing-the-summarization-pipeline)
14 | 6. [Running the Summarization](#running-the-summarization)
15 | 7. [Customization and Advanced Usage](#customization-and-advanced-usage)
16 | 8. [Conclusion](#conclusion)
17 |
18 | ## Introduction
19 |
20 | The summarization pipeline is going to be composed of two steps -
21 | - PDF to Text extraction. We are going to use a pre-built extractor for this - `tensorlake/pdfextractor`.
22 | - We use Mistral for summarization.
23 |
24 |
25 | ## Prerequisites
26 |
27 | Before we begin, ensure you have the following:
28 |
29 | - Create a virtual env with Python 3.9 or later
30 | ```shell
31 | python3.9 -m venv ve
32 | source ve/bin/activate
33 | ```
34 | - `pip` (Python package manager)
35 | - A Mistral API key
36 | - Basic familiarity with Python and command-line interfaces
37 |
38 | ## Setup
39 |
40 | ### Install Indexify
41 |
42 | First, let's install Indexify using the official installation script:
43 |
44 | ```bash
45 | curl https://getindexify.ai | sh
46 | ```
47 |
48 | Start the Indexify server:
49 | ```bash
50 | ./indexify server -d
51 | ```
52 | This starts a long running server that exposes ingestion and retrieval APIs to applications.
53 |
54 | ### Install Required Extractors
55 |
56 | Next, we'll install the necessary extractors in a new terminal:
57 |
58 | ```bash
59 | pip install indexify-extractor-sdk
60 | indexify-extractor download tensorlake/pdfextractor
61 | indexify-extractor download tensorlake/mistral
62 | ```
63 |
64 | Once the extractors are downloaded, you can start them:
65 | ```bash
66 | indexify-extractor join-server
67 | ```
68 |
69 | ## Creating the Extraction Graph
70 |
71 | The extraction graph defines the flow of data through our summarization pipeline. We'll create a graph that first extracts text from PDFs, then sends that text to Mistral for summarization.
72 |
73 | Create a new Python file called `pdf_summarization_graph.py` and add the following code:
74 |
75 | ```python
76 | from indexify import IndexifyClient, ExtractionGraph
77 |
78 | client = IndexifyClient()
79 |
80 | extraction_graph_spec = """
81 | name: 'pdf_summarizer'
82 | extraction_policies:
83 | - extractor: 'tensorlake/pdfextractor'
84 | name: 'pdf_to_text'
85 | - extractor: 'tensorlake/mistral'
86 | name: 'text_to_summary'
87 | input_params:
88 | model_name: 'mistral-large-latest'
89 | key: 'YOUR_MISTRAL_API_KEY'
90 | system_prompt: 'Summarize the following text in a concise manner, highlighting the key points:'
91 | content_source: 'pdf_to_text'
92 | """
93 |
94 | extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
95 | client.create_extraction_graph(extraction_graph)
96 | ```
97 |
98 | Replace `'YOUR_MISTRAL_API_KEY'` with your actual Mistral API key.
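
If you prefer not to hard-code the key, one option is to substitute it from an environment variable before building the graph (a small sketch; place it before the `ExtractionGraph.from_yaml` call and make sure `MISTRAL_API_KEY` is set in your environment):

```python
import os

# Replace the placeholder in the spec above with the key from the environment
extraction_graph_spec = extraction_graph_spec.replace(
    "YOUR_MISTRAL_API_KEY", os.environ["MISTRAL_API_KEY"]
)
```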
99 |
100 | You can run this script to set up the pipeline:
101 | ```bash
102 | python pdf_summarization_graph.py
103 | ```
104 |
105 | ## Implementing the Summarization Pipeline
106 |
107 | Now that we have our extraction graph set up, we can upload files and make the pipeline generate summaries:
108 |
109 | Create a file `upload_and_retreive.py`
110 |
111 | ```python
112 | import os
113 | import requests
114 | from indexify import IndexifyClient
115 |
116 | def download_pdf(url, save_path):
117 | response = requests.get(url)
118 | with open(save_path, 'wb') as f:
119 | f.write(response.content)
120 | print(f"PDF downloaded and saved to {save_path}")
121 |
122 | def summarize_pdf(pdf_path):
123 | client = IndexifyClient()
124 |
125 | # Upload the PDF file
126 | content_id = client.upload_file("pdf_summarizer", pdf_path)
127 |
128 | # Wait for the extraction to complete
129 | client.wait_for_extraction(content_id)
130 |
131 | # Retrieve the summarized content
132 | summary = client.get_extracted_content(
133 | content_id=content_id,
134 | graph_name="pdf_summarizer",
135 | policy_name="text_to_summary"
136 | )
137 |
138 | return summary[0]['content'].decode('utf-8')
139 |
140 | # Example usage
141 | if __name__ == "__main__":
142 | pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
143 | pdf_path = "reference_document.pdf"
144 |
145 | # Download the PDF
146 | download_pdf(pdf_url, pdf_path)
147 |
148 | # Summarize the PDF
149 | summary = summarize_pdf(pdf_path)
150 | print("Summary of the PDF:")
151 | print(summary)
152 | ```
153 |
154 | You can run the Python script as many times as you like, or use it in an application to keep generating summaries:
155 | ```bash
156 | python upload_and_retreive.py
157 | ```
158 |
159 | ## Customization and Advanced Usage
160 |
161 | You can customize the summarization process by modifying the `system_prompt` in the extraction graph. For example:
162 |
163 | - To generate bullet-point summaries:
164 | ```yaml
165 | system_prompt: 'Summarize the following text as a list of bullet points:'
166 | ```
167 |
168 | - To focus on specific aspects of the document:
169 | ```yaml
170 | system_prompt: 'Summarize the main arguments and supporting evidence from the following text:'
171 | ```
172 |
173 | You can also experiment with different Mistral models by changing the `model_name` parameter to find the best balance between speed and accuracy for your specific use case.
174 |
175 | ## Conclusion
176 |
177 | While the example might look simple, there are some unique advantages of using Indexify for this -
178 |
179 | 1. **Scalable and Highly Available**: The Indexify server can be deployed in the cloud and can process thousands of uploaded PDFs; if any step in the pipeline fails, it automatically retries on another machine.
180 | 2. **Flexibility**: You can swap in any other [PDF extraction model](https://docs.getindexify.ai/usecases/pdf_extraction/) if the one we used here doesn't work for your documents.
181 |
182 | ## Next Steps
183 |
184 | - Learn more about Indexify on our docs - https://docs.getindexify.ai
185 | - Learn how to use Indexify and Mistral for [entity extraction from PDF documents](../pdf-entity-extraction/)
186 |
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-entity-extraction/README.md:
--------------------------------------------------------------------------------
1 | # PDF Entity Extraction with Indexify and Mistral
2 |
3 | This cookbook demonstrates how to build a robust entity extraction pipeline for PDF documents using Indexify and Mistral's large language models. You will learn how to efficiently extract named entities from PDF files for various applications such as information retrieval, content analysis, and data mining.
4 |
5 | ## Table of Contents
6 |
7 | 1. [Introduction](#introduction)
8 | 2. [Prerequisites](#prerequisites)
9 | 3. [Setup](#setup)
10 | - [Install Indexify](#install-indexify)
11 | - [Install Required Extractors](#install-required-extractors)
12 | 4. [Creating the Extraction Graph](#creating-the-extraction-graph)
13 | 5. [Implementing the Entity Extraction Pipeline](#implementing-the-entity-extraction-pipeline)
14 | 6. [Running the Entity Extraction](#running-the-entity-extraction)
15 | 7. [Customization and Advanced Usage](#customization-and-advanced-usage)
16 | 8. [Conclusion](#conclusion)
17 |
18 | ## Introduction
19 |
20 | Entity extraction, also known as named entity recognition (NER), involves identifying and classifying named entities in text into predefined categories such as persons, organizations, locations, dates, and more. By applying this technique to PDF documents, we can automatically extract structured information from unstructured text, making it easier to analyze and utilize the content of these documents.
21 |
22 | ## Prerequisites
23 |
24 | Before we begin, ensure you have the following:
25 |
26 | - Create a virtual env with Python 3.9 or later
27 | ```shell
28 | python3.9 -m venv ve
29 | source ve/bin/activate
30 | ```
31 | - `pip` (Python package manager)
32 | - A Mistral API key
33 | - Basic familiarity with Python and command-line interfaces
34 |
35 | ## Setup
36 |
37 | ### Install Indexify
38 |
39 | First, let's install Indexify using the official installation script:
40 |
41 | ```bash
42 | curl https://getindexify.ai | sh
43 | ```
44 |
45 | Start the Indexify server:
46 | ```bash
47 | ./indexify server -d
48 | ```
49 | This starts a long running server that exposes ingestion and retrieval APIs to applications.
50 |
51 | ### Install Required Extractors
52 |
53 | Next, we'll install the necessary extractors in a new terminal:
54 |
55 | ```bash
56 | pip install indexify-extractor-sdk
57 | indexify-extractor download tensorlake/pdfextractor
58 | indexify-extractor download tensorlake/mistral
59 | ```
60 |
61 | Once the extractors are downloaded, you can start them:
62 | ```bash
63 | indexify-extractor join-server
64 | ```
65 |
66 | ## Creating the Extraction Graph
67 |
68 | The extraction graph defines the flow of data through our entity extraction pipeline. We'll create a graph that first extracts text from PDFs, then sends that text to Mistral for entity extraction.
69 |
70 | Create a new Python file called `pdf_entity_extraction_pipeline.py` and add the following code:
71 |
72 | ```python
73 | from indexify import IndexifyClient, ExtractionGraph
74 |
75 | client = IndexifyClient()
76 |
77 | extraction_graph_spec = """
78 | name: 'pdf_entity_extractor'
79 | extraction_policies:
80 | - extractor: 'tensorlake/pdfextractor'
81 | name: 'pdf_to_text'
82 | - extractor: 'tensorlake/mistral'
83 | name: 'text_to_entities'
84 | input_params:
85 | model_name: 'mistral-large-latest'
86 | key: 'YOUR_MISTRAL_API_KEY'
87 | system_prompt: 'Extract and categorize all named entities from the following text. Provide the results in a JSON format with categories: persons, organizations, locations, dates, and miscellaneous.'
88 | content_source: 'pdf_to_text'
89 | """
90 |
91 | extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
92 | client.create_extraction_graph(extraction_graph)
93 | ```
94 |
95 | Replace `'YOUR_MISTRAL_API_KEY'` with your actual Mistral API key.
96 |
97 | You can run this script to set up the pipeline:
98 | ```bash
99 | python pdf_entity_extraction_pipeline.py
100 | ```
101 |
102 | ## Implementing the Entity Extraction Pipeline
103 |
104 | Now that we have our extraction graph set up, we can upload files and retrieve the entities:
105 |
106 | Create a file `upload_and_retreive.py`
107 |
108 | ```python
109 | import json
110 | import os
111 | import requests
112 | from indexify import IndexifyClient
113 |
114 | def download_pdf(url, save_path):
115 | response = requests.get(url)
116 | with open(save_path, 'wb') as f:
117 | f.write(response.content)
118 | print(f"PDF downloaded and saved to {save_path}")
119 |
120 |
121 | def extract_entities_from_pdf(pdf_path):
122 | client = IndexifyClient()
123 |
124 | # Upload the PDF file
125 | content_id = client.upload_file("pdf_entity_extractor", pdf_path)
126 |
127 | # Wait for the extraction to complete
128 | client.wait_for_extraction(content_id)
129 |
130 | # Retrieve the extracted entities
131 | entities_content = client.get_extracted_content(
132 | content_id=content_id,
133 | graph_name="pdf_entity_extractor",
134 | policy_name="text_to_entities"
135 | )
136 |
137 | # Parse the JSON response
138 | entities = json.loads(entities_content[0]['content'].decode('utf-8'))
139 | return entities
140 |
141 | # Example usage
142 | if __name__ == "__main__":
143 | pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
144 | pdf_path = "reference_document.pdf"
145 |
146 | # Download the PDF
147 | download_pdf(pdf_url, pdf_path)
148 | extracted_entities = extract_entities_from_pdf(pdf_path)
149 |
150 | print("Extracted Entities:")
151 | for category, entities in extracted_entities.items():
152 | print(f"\n{category.capitalize()}:")
153 | for entity in entities:
154 | print(f"- {entity}")
155 | ```
156 |
157 | You can run the Python script as many times as you like, or use it in an application to keep extracting entities:
158 | ```bash
159 | python upload_and_retreive.py
160 | ```
161 |
162 | ## Customization and Advanced Usage
163 |
164 | You can customize the entity extraction process by modifying the `system_prompt` in the extraction graph. For example:
165 |
166 | - To focus on specific entity types:
167 | ```yaml
168 | system_prompt: 'Extract only person names and organizations from the following text. Provide the results in a JSON format with categories: persons and organizations.'
169 | ```
170 |
171 | - To include entity relationships:
172 | ```yaml
173 | system_prompt: 'Extract named entities and their relationships from the following text. Provide the results in a JSON format with categories: entities (including type and name) and relationships (including type and involved entities).'
174 | ```
175 |
176 | You can also experiment with different Mistral models by changing the `model_name` parameter to find the best balance between speed and accuracy for your specific use case.
177 |
178 | ## Conclusion
179 |
180 | While the example might look simple, there are some unique advantages of using Indexify for this -
181 |
182 | 1. **Scalable and Highly Available**: The Indexify server can be deployed in the cloud and can process thousands of uploaded PDFs; if any step in the pipeline fails, it automatically retries on another machine.
183 | 2. **Flexibility**: You can swap in any other [PDF extraction model](https://docs.getindexify.ai/usecases/pdf_extraction/) if the one we used here doesn't work for your documents.
184 |
185 | ## Next Steps
186 |
187 | - Learn more about Indexify on our docs - https://docs.getindexify.ai
188 | - Go over an example, which uses Mistral for [building summarization at scale](../pdf-summarization/)
189 |
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-summarization/pdf-summarization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# PDF Summarization with Indexify and Mistral\n",
8 | "\n",
9 | "In this cookbook, we'll explore how to create a PDF summarization pipeline using Indexify and Mistral's large language models. By the end of the document, you should have a pipeline capable of ingesting 1000s of PDF documents, and using Mistral for summarization.\n",
10 | "\n",
11 | "## Introduction\n",
12 | "\n",
13 | "The summarization pipeline is going to be composed of two steps -\n",
14 | "- PDF to Text extraction. We are going to use a pre-built extractor for this - `tensorlake/pdfextractor`.\n",
15 | "- We use Mistral for summarization.\n",
16 | "\n",
17 | "\n",
18 | "## Prerequisites\n",
19 | "\n",
20 | "Before we begin, ensure you have the following:\n",
21 | "\n",
22 | "- Create a virtual env with Python 3.9 or later\n",
23 | " ```shell\n",
24 | " python3.9 -m venv ve\n",
25 | " source ve/bin/activate\n",
26 | " ```\n",
27 | "- `pip` (Python package manager)\n",
28 | "- A Mistral API key\n",
29 | "- Basic familiarity with Python and command-line interfaces\n",
30 | "\n",
31 | "## Setup\n",
32 | "\n",
33 | "### Install Indexify\n",
34 | "\n",
35 | "First, let's install Indexify using the official installation script in a terminal:\n",
36 | "\n",
37 | "```bash\n",
38 | "curl https://getindexify.ai | sh\n",
39 | "```\n",
40 | "\n",
41 | "Start the Indexify server:\n",
42 | "```bash\n",
43 | "./indexify server -d\n",
44 | "```\n",
45 | "This starts a long running server that exposes ingestion and retrieval APIs to applications.\n",
46 | "\n",
47 | "### Install Required Extractors\n",
48 | "\n",
49 | "Next, we'll install the necessary extractors in a new terminal:\n",
50 | "\n",
51 | "```bash\n",
52 | "pip install indexify-extractor-sdk\n",
53 | "indexify-extractor download tensorlake/pdfextractor\n",
54 | "indexify-extractor download tensorlake/mistral\n",
55 | "```\n",
56 | "\n",
57 | "Once the extractors are downloaded, you can start them:\n",
58 | "```bash\n",
59 | "indexify-extractor join-server\n",
60 | "```"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "## Creating the Extraction Graph\n",
68 | "\n",
69 | "The extraction graph defines the flow of data through our summarization pipeline. We'll create a graph that first extracts text from PDFs, then sends that text to Mistral for summarization."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "from indexify import IndexifyClient, ExtractionGraph\n",
79 | "\n",
80 | "client = IndexifyClient()\n",
81 | "\n",
82 | "extraction_graph_spec = \"\"\"\n",
83 | "name: 'pdf_summarizer'\n",
84 | "extraction_policies:\n",
85 | " - extractor: 'tensorlake/pdfextractor'\n",
86 | " name: 'pdf_to_text'\n",
87 | " - extractor: 'tensorlake/mistral'\n",
88 | " name: 'text_to_summary'\n",
89 | " input_params:\n",
90 | " model_name: 'mistral-large-latest'\n",
91 | " key: 'YOUR_MISTRAL_API_KEY'\n",
92 | " system_prompt: 'Summarize the following text in a concise manner, highlighting the key points:'\n",
93 | " content_source: 'pdf_to_text'\n",
94 | "\"\"\"\n",
95 | "\n",
96 | "extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)\n",
97 | "client.create_extraction_graph(extraction_graph)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "Replace `'YOUR_MISTRAL_API_KEY'` with your actual Mistral API key."
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Implementing the Summarization Pipeline\n",
112 | "\n",
113 | "Now that we have our extraction graph set up, we can upload files and make the pipeline generate summaries:"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "import os\n",
123 | "import requests\n",
124 | "from indexify import IndexifyClient\n",
125 | "\n",
126 | "def download_pdf(url, save_path):\n",
127 | " response = requests.get(url)\n",
128 | " with open(save_path, 'wb') as f:\n",
129 | " f.write(response.content)\n",
130 | " print(f\"PDF downloaded and saved to {save_path}\")\n",
131 | "\n",
132 | "def summarize_pdf(pdf_path):\n",
133 | " client = IndexifyClient()\n",
134 | " \n",
135 | " # Upload the PDF file\n",
136 | " content_id = client.upload_file(\"pdf_summarizer\", pdf_path)\n",
137 | " \n",
138 | " # Wait for the extraction to complete\n",
139 | " client.wait_for_extraction(content_id)\n",
140 | " \n",
141 | " # Retrieve the summarized content\n",
142 | " summary = client.get_extracted_content(\n",
143 | " content_id=content_id,\n",
144 | " graph_name=\"pdf_summarizer\",\n",
145 | " policy_name=\"text_to_summary\"\n",
146 | " )\n",
147 | " \n",
148 | " return summary[0]['content'].decode('utf-8')"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "pdf_url = \"https://arxiv.org/pdf/2310.06825.pdf\"\n",
158 | "pdf_path = \"reference_document.pdf\"\n",
159 | "\n",
160 | "# Download the PDF\n",
161 | "download_pdf(pdf_url, pdf_path)\n",
162 | "\n",
163 | "# Summarize the PDF\n",
164 | "summary = summarize_pdf(pdf_path)\n",
165 | "print(\"Summary of the PDF:\")\n",
166 | "print(summary)"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## Customization and Advanced Usage\n",
174 | "\n",
175 | "You can customize the summarization process by modifying the `system_prompt` in the extraction graph. For example:\n",
176 | "\n",
177 | "- To generate bullet-point summaries:\n",
178 | " ```yaml\n",
179 | " system_prompt: 'Summarize the following text as a list of bullet points:'\n",
180 | " ```\n",
181 | "\n",
182 | "- To focus on specific aspects of the document:\n",
183 | " ```yaml\n",
184 | " system_prompt: 'Summarize the main arguments and supporting evidence from the following text:'\n",
185 | " ```\n",
186 | "\n",
187 | "You can also experiment with different Mistral models by changing the `model_name` parameter to find the best balance between speed and accuracy for your specific use case.\n",
188 | "\n",
189 | "## Conclusion\n",
190 | "\n",
191 | "While the example might look simple, there are some unique advantages of using Indexify for this -\n",
192 | "\n",
193 | "1. **Scalable and Highly Availability**: Indexify server can be deployed on a cloud and it can process 1000s of PDFs uploaded into it, and if any step in the pipeline fails it automatically retries on another machine.\n",
194 | "2. **Flexibility**: You can use any other [PDF extraction model](https://docs.getindexify.ai/usecases/pdf_extraction/) we used here doesn't work for the document you are using.\n",
195 | "\n",
196 | "## Next Steps\n",
197 | "\n",
198 | "- Learn more about Indexify on our docs - https://docs.getindexify.ai\n",
199 | "- Learn how to use Indexify and Mistral for [entity extraction from PDF documents](../pdf-entity-extraction/)"
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "language_info": {
205 | "name": "python"
206 | }
207 | },
208 | "nbformat": 4,
209 | "nbformat_minor": 2
210 | }
211 |
--------------------------------------------------------------------------------
/third_party/Haystack/haystack_chat_with_docs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using Mistral AI with Haystack\n",
8 | "\n",
9 | "In this cookbook, we will use Mistral embeddings and generative models in 2 [Haystack](https://github.com/deepset-ai/haystack) pipelines:\n",
10 | "\n",
11 | "1) We will build an indexing pipeline that can create embeddings for the contents of URLs and indexes them into a vector database\n",
12 | "2) We will build a retrieval-augmented chat pipeline to chat with the contents of the URLs\n",
13 | "\n",
14 | "First, we install our dependencies"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install mistral-haystack"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "Next, we need to set the `MISTRAL_API_KEY` environment variable 👇"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import os\n",
40 | "from getpass import getpass\n",
41 | "\n",
42 | "os.environ[\"MISTRAL_API_KEY\"] = getpass(\"Mistral API Key:\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Index URLs with Mistral Embeddings\n",
50 | "\n",
51 | "Below, we are using `mistral-embed` in a full Haystack indexing pipeline. We create embeddings for the contents of the chosen URLs with `mistral-embed` and write them to an [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/v2.0/docs/inmemorydocumentstore) using the [`MistralDocumentEmbedder`](https://docs.haystack.deepset.ai/v2.0/docs/mistraldocumentembedder). \n",
52 | "\n",
53 | "> 💡This document store is the simplest to get started with as it has no requirements to setup. Feel free to change this document store to any of the [vector databases available for Haystack 2.0](https://haystack.deepset.ai/integrations?type=Document+Store) such as **Weaviate**, **Chroma**, **AstraDB** etc."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "from haystack import Pipeline\n",
63 | "from haystack.components.converters import HTMLToDocument\n",
64 | "from haystack.components.fetchers import LinkContentFetcher\n",
65 | "from haystack.components.writers import DocumentWriter\n",
66 | "from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
67 | "from haystack_integrations.components.embedders.mistral.document_embedder import MistralDocumentEmbedder\n",
68 | "\n",
69 | "\n",
70 | "document_store = InMemoryDocumentStore()\n",
71 | "fetcher = LinkContentFetcher()\n",
72 | "converter = HTMLToDocument()\n",
73 | "embedder = MistralDocumentEmbedder()\n",
74 | "writer = DocumentWriter(document_store=document_store)\n",
75 | "\n",
76 | "indexing = Pipeline()\n",
77 | "\n",
78 | "indexing.add_component(name=\"fetcher\", instance=fetcher)\n",
79 | "indexing.add_component(name=\"converter\", instance=converter)\n",
80 | "indexing.add_component(name=\"embedder\", instance=embedder)\n",
81 | "indexing.add_component(name=\"writer\", instance=writer)\n",
82 | "\n",
83 | "indexing.connect(\"fetcher\", \"converter\")\n",
84 | "indexing.connect(\"converter\", \"embedder\")\n",
85 | "indexing.connect(\"embedder\", \"writer\")\n"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "urls = [\"https://mistral.ai/news/la-plateforme/\", \"https://mistral.ai/news/mixtral-of-experts\"]\n",
95 | "\n",
96 | "indexing.run({\"fetcher\": {\"urls\": urls}})"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "## Chat With the URLs with Mistral Generative Models\n",
104 | "\n",
105 | "Now that we have indexed the contents and embeddings of various URLs, we can create a RAG pipeline that uses the [`MistralChatGenerator`](https://docs.haystack.deepset.ai/v2.0/docs/mistralchatgenerator) component with `mistral-small`.\n",
106 | "A few more things to know about this pipeline:\n",
107 | "\n",
108 | "- We are using the [`MistralTextEmbdder`](https://docs.haystack.deepset.ai/v2.0/docs/mistraltextembedder) to embed our question and retrieve the most relevant 1 document\n",
109 | "- We are enabling streaming responses by providing a `streaming_callback`\n",
110 | "- `documents` is being provided to the chat template by the retriever, while we provide `query` to the pipeline when we run it."
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "from haystack import Pipeline\n",
120 | "from haystack.components.builders import DynamicChatPromptBuilder\n",
121 | "from haystack.components.generators.utils import print_streaming_chunk\n",
122 | "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n",
123 | "from haystack.dataclasses import ChatMessage\n",
124 | "from haystack_integrations.components.embedders.mistral.text_embedder import MistralTextEmbedder\n",
125 | "from haystack_integrations.components.generators.mistral import MistralChatGenerator\n",
126 | "\n",
127 | "text_embedder = MistralTextEmbedder()\n",
128 | "retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=1)\n",
129 | "prompt_builder = DynamicChatPromptBuilder(runtime_variables=[\"documents\"])\n",
130 | "llm = MistralChatGenerator(model='mistral-small', streaming_callback=print_streaming_chunk)\n",
131 | "\n",
132 | "rag_pipeline = Pipeline()\n",
133 | "rag_pipeline.add_component(\"text_embedder\", text_embedder)\n",
134 | "rag_pipeline.add_component(\"retriever\", retriever)\n",
135 | "rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n",
136 | "rag_pipeline.add_component(\"llm\", llm)\n",
137 | "\n",
138 | "\n",
139 | "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n",
140 | "rag_pipeline.connect(\"retriever.documents\", \"prompt_builder.documents\")\n",
141 | "rag_pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "question = \"What generative endpoints does the Mistral platform have?\"\n",
151 | "\n",
152 | "chat_template = \"\"\"Answer the following question based on the contents of the documents.\\n\n",
153 | " Question: {{query}}\\n\n",
154 | " Documents: {{documents[0].content}}\n",
155 | " \"\"\"\n",
156 | "messages = [ChatMessage.from_user(chat_template)]\n",
157 | "\n",
158 | "result = rag_pipeline.run(\n",
159 | " {\n",
160 | " \"text_embedder\": {\"text\": question},\n",
161 | " \"prompt_builder\": {\"template_variables\": {\"query\": question}, \"prompt_source\": messages},\n",
162 | " \"llm\": {\"generation_kwargs\": {\"max_tokens\": 165}},\n",
163 | " },\n",
164 | " debug=True\n",
165 | ")"
166 | ]
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "mistral",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "3.12.1"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 2
190 | }
191 |
--------------------------------------------------------------------------------
/concept-deep-dive/quantization/README.md:
--------------------------------------------------------------------------------
1 | # Quantization
2 |
3 | **Quantization** is a process that plays a crucial role in Large Language Models. These vast neural networks are primarily composed of raw values, known as `weights` or parameters. The higher the number of parameters, the larger the model and the more knowledge it can extract and retain, but also the more **resources** it requires to run. These models can range from a few million to many billions of parameters.
4 |
5 | ## Inference and Resource Requirements
6 |
7 | When we want to run an LLM, also called **Inference**, we need to load the entire model, including all of its parameters, to perform the necessary computations. The RAM/VRAM requirement scales linearly with the number of parameters, meaning that **larger models necessitate more memory**.
8 |
9 | ## The Role of Quantization
10 |
11 | This is where quantization comes into play. The parameters that hold all the knowledge and decision-making capabilities of our models are stored with a certain number of bits. For example, the models we have open-sourced are usually in BF16, i.e. stored with 16 bits of precision.
12 |
13 | Here's a simplified representation of how a value in BF16 is stored:
14 |
15 | | Bit Position | 15 | 14-7 | 6-0 |
16 | |--------------|:--:|:--:|:--:|
17 | | Component | sign| exponent | fraction |
18 |
19 | The goal of quantization is to reduce the precision required for each parameter, or weight, without significantly impacting the model's performance. This is not always a simple truncation process. There are different methods of quantization that aim to minimize the impact of reducing precision.
20 |
21 | By quantizing the model's values to use fewer bits on average (such as 16 bits, 8 bits, 6 bits, 4 bits, or even lower), we can store the models more efficiently on disk and in memory. This makes them more accessible and less resource-intensive to run.
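
To make this concrete, here is a tiny sketch (using PyTorch purely as an illustration; it is not part of the original material) showing that casting a value to BF16 keeps its range but drops fraction bits, and halves the storage per value:

```python
import torch

x = torch.tensor([3.14159265], dtype=torch.float32)
x_bf16 = x.to(torch.bfloat16)

print(x_bf16)                 # slightly less precise: fewer fraction bits
print(x.element_size())       # 4 bytes per FP32 value
print(x_bf16.element_size())  # 2 bytes per BF16 value
```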
22 |
23 | ## Estimate Memory Requirements
24 |
25 | One might wonder how much memory will be required and how to calculate it. We can estimate it with this simple formula:
26 | - required_bytes = n_parameters * bytes_per_parameter
27 |
28 | Let's apply this formula to different data types for our 7.3B model for comparison (a short Python sketch of the same calculation follows the table)!
29 |
30 | | Data Type | Bytes | Range | N° Different Values | Memory |
31 | |--------------|:--:|:--:|:--:|:--:|
32 | | FP32 | 4 | -1.18e38 : 3.4e38 | 2^32 | 29.2 GB |
33 | | FP16 | 2 | -65k : 65k | 2^16 | 14.6 GB |
34 | | BF16 | 2 | -3.39e38 : 3.39e38 | 2^16 | 14.6 GB |
35 | | FP8 (E5M2) | 1 | -57k : 57k | 256 | 7.3 GB |
36 | | INT8 | 1 | -128 : 127 | 256 | 7.3 GB |
37 | | INT4 | 0.5 | -8 : 7 | 16 | 3.65 GB |
38 |
39 | It is crucial to note that this is only the memory needed to load the model for inference; it does not account for the context size (sequence length) or the memory used during computation.
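
As a quick sanity check, here is a minimal Python sketch of the same calculation, using the 7.3B parameter count and the bytes-per-parameter values from the table above:

```python
# required_bytes = n_parameters * bytes_per_parameter
n_parameters = 7.3e9

bytes_per_parameter = {"FP32": 4, "FP16/BF16": 2, "FP8/INT8": 1, "INT4": 0.5}

for dtype, nbytes in bytes_per_parameter.items():
    required_gb = n_parameters * nbytes / 1e9  # decimal gigabytes, as in the table
    print(f"{dtype}: {required_gb:.2f} GB")
```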
40 |
41 | ## Mistral Models
42 |
43 | Making use of the same formula, let's compute a rough estimate of the required memory at different data types for each model.
44 |
45 | | Model | Params | BF16 | FP8 | INT4 |
46 | |--------------|:--:|:--:|:--:|:--:|
47 | | Mistral 7B | 7.3B | 14.6 GB | 7.3 GB | 3.65 GB |
48 | | Mathstral 7B | 7.3B | 14.6 GB | 7.3 GB | 3.65 GB |
49 | | Codestral Mamba 7B | 7.3B | 14.6 GB | 7.3 GB | 3.65 GB |
50 | | Mistral Nemo 12B | 12.2B | 24.4 GB | 12.2 GB | 6.1 GB |
51 | | Mixtral 8x7B | 46.7B | 93.4 GB | 46.7 GB | 23.35 GB |
52 | | Codestral 22B | 22.2B | 44.4 GB | 22.2 GB | 11.1 GB |
53 | | Mixtral 8x22B | 140.6B | 281.2 GB | 140.6 GB | 70.3 GB |
54 | | Mistral Large 2407 | 123B | 246 GB | 123 GB | 61.5 GB |
55 |
56 | ## Quantization Formats
57 |
58 | While it is common practice to release weights in BF16 or FP16, since hardware is optimized for inference at those precisions, the community has favored 8 bits, as the precision loss is minimal or even negligible. Special techniques such as Quantization-Aware Training can even allow lossless inference at FP8, as with Mistral Nemo 12B.
59 |
60 | As mentioned previously, quantization is often not a simple truncation to fit in fewer bits. There are a lot of different formats and precisions.
61 |
62 | Among them we have:
63 |
64 |
65 | **Bits-and-bytes**
66 |
67 |
68 |
69 |
70 |
71 | Bits-and-bytes is a very fast and straightforward approach to quantization, quantizing while loading. However, speed and quality are not optimal, which makes it most useful for quickly quantizing and loading models.
72 |
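As an illustration of this load-time quantization, here is a minimal sketch using the bitsandbytes integration of Hugging Face `transformers` (the checkpoint id and the 4-bit settings below are example assumptions, not recommendations from this guide):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-v0.1"  # example checkpoint, assumed for illustration

# Quantize the weights to 4 bits on the fly while the model is loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
```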
73 |
74 |
75 | **GGUF**
76 |
77 |
78 |
79 |
80 |
81 | Previously known as GGML, GGUF is favored by much of the community for its ability to run efficiently on CPUs and Apple devices, offloading layers to a GPU when available, making it a good choice for local testing and deployment.
82 |
83 |
84 |
85 | **GPTQ**
86 |
87 |
88 |
89 |
90 |
91 | While GGUF focuses on CPU inference, GPTQ is oriented towards GPU inference performance, reducing quantization errors with the help of a calibration dataset.
92 |
93 |
94 |
95 | **AWQ**
96 |
97 |
98 |
99 |
100 |
101 | AWQ is also oriented towards GPU inference. It is based on the observation that roughly 1% of weights contribute disproportionately to the model's accuracy; these critical weights are protected by using a calibration dataset to analyze activation distributions during inference and identify which weights matter most.
102 |
103 |
104 |
105 | **EXL2**
106 |
107 |
108 |
109 |
110 |
111 | A more recent format based on the GPTQ optimization method, but with mixed quantization levels: it targets a desired average bitrate while producing smaller errors than GPTQ at the same or similar bitrate. It can have slightly higher VRAM usage, but better inference speed and quality.
112 |
113 |
114 | *Other formats: HQQ, AQLM, EETQ, Marlin...*
115 |
116 | Among these formats, the most popular and widely used precisions are between 4 bits and 8 bits. While 8 bits has the best accuracy, 6 bits still retains most of the model's capabilities without degrading quality too much. Only below 4 bits of precision do we start to see a strong impact on most models, degrading their abilities considerably. While there are still use cases for bitrates below 4 bits, quality can vary immensely depending on the model, optimization method, and format.
117 |
118 | ## Inference & Quantized Models
119 |
120 | Not all inference engines support all possible formats; some are highly specialized and optimized for specific formats, while others aim to generalize and support all kinds.
121 |
122 |
123 | Among these engines, we have:
124 | - **[VLLM](https://github.com/vllm-project/vllm)**: One of the oldest and most standard engines, supporting GPTQ, AWQ, INT4, INT8, and FP8 (a short vLLM sketch follows this list).
125 | - **[Exllamav2](https://github.com/turboderp/exllamav2)**: Mostly for GPTQ and EXL2 formats.
126 | - **[llama.cpp](https://github.com/ggerganov/llama.cpp)**/**[ollama](https://github.com/ollama/ollama)**: Good options for GGUF inference.
127 | - **[Aphrodite](https://github.com/PygmalionAI/aphrodite-engine)**: A big generalized engine for production with support for AWQ, Bitsandbytes, EXL2, GGUF, GPTQ, and many others.
128 |
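For instance, here is a minimal vLLM sketch that loads an AWQ-quantized checkpoint (the community checkpoint id below is assumed purely for illustration):

```python
from vllm import LLM, SamplingParams

# Example AWQ-quantized checkpoint, assumed for illustration
llm = LLM(model="TheBloke/Mistral-7B-Instruct-v0.2-AWQ", quantization="awq")

sampling_params = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Explain quantization in one sentence."], sampling_params)
print(outputs[0].outputs[0].text)
```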
--------------------------------------------------------------------------------
/third_party/Chainlit/public/logo_light.svg:
--------------------------------------------------------------------------------
1 |
27 |
--------------------------------------------------------------------------------
/third_party/Indexify/pdf-entity-extraction/pdf-entity-extraction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# PDF Entity Extraction with Indexify and Mistral\n",
8 | "\n",
9 | "This cookbook demonstrates how to build a robust entity extraction pipeline for PDF documents using Indexify and Mistral's large language models. You will learn how to efficiently extract named entities from PDF files for various applications such as information retrieval, content analysis, and data mining.\n",
10 | "\n",
11 | "## Introduction\n",
12 | "\n",
13 | "Entity extraction, also known as named entity recognition (NER) involves identifying and classifying named entities in text into predefined categories such as persons, organizations, locations, dates, and more. By applying this technique to PDF documents, we can automatically extract structured information from unstructured text, making it easier to analyze and utilize the content of these documents.\n",
14 | "\n",
15 | "## Prerequisites\n",
16 | "\n",
17 | "Before we begin, ensure you have the following:\n",
18 | "\n",
19 | "- Create a virtual env with Python 3.9 or later\n",
20 | " ```shell\n",
21 | " python3.9 -m venv ve\n",
22 | " source ve/bin/activate\n",
23 | " ```\n",
24 | "- `pip` (Python package manager)\n",
25 | "- A Mistral API key\n",
26 | "- Basic familiarity with Python and command-line interfaces\n",
27 | "\n",
28 | "## Setup\n",
29 | "\n",
30 | "### Install Indexify\n",
31 | "\n",
32 | "First, let's install Indexify using the official installation script in a terminal:\n",
33 | "\n",
34 | "```bash\n",
35 | "curl https://getindexify.ai | sh\n",
36 | "```\n",
37 | "\n",
38 | "Start the Indexify server:\n",
39 | "```bash\n",
40 | "./indexify server -d\n",
41 | "```\n",
42 | "This starts a long running server that exposes ingestion and retrieval APIs to applications.\n",
43 | "\n",
44 | "### Install Required Extractors\n",
45 | "\n",
46 | "Next, we'll install the necessary extractors in a new terminal:\n",
47 | "\n",
48 | "```bash\n",
49 | "pip install indexify-extractor-sdk\n",
50 | "indexify-extractor download tensorlake/pdfextractor\n",
51 | "indexify-extractor download tensorlake/mistral\n",
52 | "```\n",
53 | "\n",
54 | "Once the extractors are downloaded, you can start them:\n",
55 | "```bash\n",
56 | "indexify-extractor join-server\n",
57 | "```"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## Creating the Extraction Graph\n",
65 | "\n",
66 | "The extraction graph defines the flow of data through our entity extraction pipeline. We'll create a graph that first extracts text from PDFs, then sends that text to Mistral for entity extraction."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from indexify import IndexifyClient, ExtractionGraph\n",
76 | "\n",
77 | "client = IndexifyClient()\n",
78 | "\n",
79 | "extraction_graph_spec = \"\"\"\n",
80 | "name: 'pdf_entity_extractor'\n",
81 | "extraction_policies:\n",
82 | " - extractor: 'tensorlake/pdfextractor'\n",
83 | " name: 'pdf_to_text'\n",
84 | " - extractor: 'tensorlake/mistral'\n",
85 | " name: 'text_to_entities'\n",
86 | " input_params:\n",
87 | " model_name: 'mistral-large-latest'\n",
88 | " key: 'YOUR_MISTRAL_API_KEY'\n",
89 | " system_prompt: 'Extract and categorize all named entities from the following text. Provide the results in a JSON format with categories: persons, organizations, locations, dates, and miscellaneous.'\n",
90 | " content_source: 'pdf_to_text'\n",
91 | "\"\"\"\n",
92 | "\n",
93 | "extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)\n",
94 | "client.create_extraction_graph(extraction_graph)"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "Replace `'YOUR_MISTRAL_API_KEY'` with your actual Mistral API key."
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "## Implementing the Entity Extraction Pipeline\n",
109 | "\n",
110 | "Now that we have our extraction graph set up, we can upload files and retrieve the entities:"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "import json\n",
120 | "import os\n",
121 | "import requests\n",
122 | "from indexify import IndexifyClient\n",
123 | "\n",
124 | "def download_pdf(url, save_path):\n",
125 | " response = requests.get(url)\n",
126 | " with open(save_path, 'wb') as f:\n",
127 | " f.write(response.content)\n",
128 | " print(f\"PDF downloaded and saved to {save_path}\")\n",
129 | "\n",
130 | "\n",
131 | "def extract_entities_from_pdf(pdf_path):\n",
132 | " client = IndexifyClient()\n",
133 | " \n",
134 | " # Upload the PDF file\n",
135 | " content_id = client.upload_file(\"pdf_entity_extractor\", pdf_path)\n",
136 | " \n",
137 | " # Wait for the extraction to complete\n",
138 | " client.wait_for_extraction(content_id)\n",
139 | " \n",
140 | " # Retrieve the extracted entities\n",
141 | " entities_content = client.get_extracted_content(\n",
142 | " content_id=content_id,\n",
143 | " graph_name=\"pdf_entity_extractor\",\n",
144 | " policy_name=\"text_to_entities\"\n",
145 | " )\n",
146 | " \n",
147 | " # Parse the JSON response\n",
148 | " entities = json.loads(entities_content[0]['content'].decode('utf-8'))\n",
149 | " return entities"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "pdf_url = \"https://arxiv.org/pdf/2310.06825.pdf\"\n",
159 | "pdf_path = \"reference_document.pdf\"\n",
160 | "\n",
161 | "# Download the PDF\n",
162 | "download_pdf(pdf_url, pdf_path)\n",
163 | "extracted_entities = extract_entities_from_pdf(pdf_path)\n",
164 | "\n",
165 | "print(\"Extracted Entities:\")\n",
166 | "for category, entities in extracted_entities.items():\n",
167 | " print(f\"\\n{category.capitalize()}:\")\n",
168 | " for entity in entities:\n",
169 | " print(f\"- {entity}\")"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "## Customization and Advanced Usage\n",
177 | "\n",
178 | "You can customize the entity extraction process by modifying the `system_prompt` in the extraction graph. For example:\n",
179 | "\n",
180 | "- To focus on specific entity types:\n",
181 | " ```yaml\n",
182 | " system_prompt: 'Extract only person names and organizations from the following text. Provide the results in a JSON format with categories: persons and organizations.'\n",
183 | " ```\n",
184 | "\n",
185 | "- To include entity relationships:\n",
186 | " ```yaml\n",
187 | " system_prompt: 'Extract named entities and their relationships from the following text. Provide the results in a JSON format with categories: entities (including type and name) and relationships (including type and involved entities).'\n",
188 | " ```\n",
189 | "\n",
190 | "You can also experiment with different Mistral models by changing the `model_name` parameter to find the best balance between speed and accuracy for your specific use case.\n",
191 | "\n",
192 | "## Conclusion\n",
193 | "\n",
194 | "While the example might look simple, there are some unique advantages of using Indexify for this -\n",
195 | "\n",
196 | "1. **Scalable and Highly Availability**: Indexify server can be deployed on a cloud and it can process 1000s of PDFs uploaded into it, and if any step in the pipeline fails it automatically retries on another machine.\n",
197 | "2. **Flexibility**: You can use any other [PDF extraction model](https://docs.getindexify.ai/usecases/pdf_extraction/) we used here doesn't work for the document you are using. \n",
198 | "\n",
199 | "## Next Steps\n",
200 | "\n",
201 | "- Learn more about Indexify on our docs - https://docs.getindexify.ai\n",
202 | "- Go over an example, which uses Mistral for [building summarization at scale](../pdf-summarization/)"
203 | ]
204 | }
205 | ],
206 | "metadata": {
207 | "language_info": {
208 | "name": "python"
209 | }
210 | },
211 | "nbformat": 4,
212 | "nbformat_minor": 2
213 | }
214 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mistral Cookbook
2 |
3 | The Mistral Cookbook features examples contributed by Mistralers and our community, as well as our partners. If you have cool examples showcasing Mistral models, feel free to share them by submitting a PR to this repo.
4 |
5 | ## Submission Guidelines:
6 |
7 | - File Format: Please submit your example in the .md or .ipynb format.
8 | - Runnable on Colab: If you're sharing a notebook example, try to make sure it's runnable on Google Colab.
9 | - Authorship: Kindly include your name, your GitHub handle, and your affiliation at the beginning of the file.
10 | - Descriptions: Please include your notebook along with its category and descriptions in the table below.
11 | - Tone: Kindly maintain a neutral tone and minimize any excessive marketing material.
12 | - Reproducibility: To ensure others can reproduce your work, kindly tag package versions in your code.
13 | - Image size: If you have images, please make sure each image's size is below 500KB.
14 | - Copyright: Always respect copyright and intellectual property laws.
15 |
16 | Disclaimer: Examples contributed by the community and partners do not represent Mistral's views and opinions.
17 |
18 | ## Content Guidelines:
19 |
20 | - Originality: Is your content original and offering a fresh perspective?
21 | - Clear: Is your content well-structured and clearly written?
22 | - Value: Is your content valuable to the community? Does the community need it?
23 |
24 | ## Main Notebooks
25 |
26 | | Notebook | Category | Description |
27 | |--------------------------------------------------------------------------------|-----------------------------|----------------------------------------------------------------------------------|
28 | | [quickstart.ipynb](quickstart.ipynb) | chat, embeddings | Basic quickstart with chat and embeddings with Mistral AI API |
29 | | [prompting_capabilities.ipynb](mistral/prompting/prompting_capabilities.ipynb) | prompting | Write prompts for classification, summarization, personalization, and evaluation |
30 | | [basic_RAG.ipynb](mistral/rag/basic_RAG.ipynb) | RAG | RAG from scratch with Mistral AI API |
31 | | [embeddings.ipynb](mistral/embeddings/embeddings.ipynb) | embeddings | Use Mistral embeddings API for classification and clustering |
32 | | [function_calling.ipynb](mistral/function_calling/function_calling.ipynb) | function calling | Use Mistral API for function calling |
33 | | [evaluation.ipynb](mistral/evaluation/evaluation.ipynb) | evaluation | Evaluate models with Mistral API |
34 | | [mistral_finetune_api.ipynb](mistral/fine_tune/mistral_finetune_api.ipynb) | fine-tuning | Finetune a model with Mistral fine-tuning API |
35 | | [mistral-search-engine.ipynb](mistral/rag/mistral-search-engine.ipynb) | RAG, function calling | Search engine built with Mistral API, function calling and RAG |
36 | | [prefix_use_cases.ipynb](mistral/prompting/prefix_use_cases.ipynb) | prefix, prompting | Cool examples with Mistral's prefix feature |
37 | | [synthetic_data_gen_and_finetune.ipynb](mistral/data_generation/synthetic_data_gen_and_finetune.ipynb) | data generation, fine-tuning | Simple data generation and fine-tuning guide |
38 | | [data_generation_refining_news.ipynb](mistral/data_generation/data_generation_refining_news.ipynb) | data generation | Simple data generation to refine news articles |
39 |
40 | ## Third Party Tools
41 |
42 | | Tools | Category | Party |
43 | | :-------------------------------------------------------------------------------------------------------------- | :--------------------- | :--------- |
44 | | [CAMEL Graph RAG with Mistral Models](third_party/CAMEL_AI/camel_graph_rag.ipynb) | multi-agent, tool, data gen| CAMEL-AI.org|
45 | | [CAMEL Role-Playing Scraper](third_party/CAMEL_AI/camel_roleplaying_scraper.ipynb) | multi-agent, tool, data gen| CAMEL-AI.org|
46 | | [adaptive_rag_mistral.ipynb](third_party/langchain/adaptive_rag_mistral.ipynb) | RAG | Langchain |
47 | | [Adaptive_RAG.ipynb](third_party/LlamaIndex/Adaptive_RAG.ipynb) | RAG | LLamaIndex |
48 | | [Agents_Tools.ipynb](third_party/LlamaIndex/Agents_Tools.ipynb) | agent | LLamaIndex |
49 | | [arize_phoenix_tracing.ipynb](third_party/Phoenix/arize_phoenix_tracing.ipynb) | tracing data | Phoenix |
50 | | [azure_ai_search_rag.ipynb](third_party/Azure_AI_Search/azure_ai_search_rag.ipynb) | RAG, embeddings | Azure |
51 | | [Chainlit - Mistral reasoning.ipynb](third_party/Chainlit/Chainlit_Mistral_reasoning.ipynb) | UI chat, tool calling | Chainlit |
52 | | [corrective_rag_mistral.ipynb](third_party/langchain/corrective_rag_mistral.ipynb) | RAG | Langchain |
53 | | [distilabel_synthetic_dpo_dataset.ipynb](third_party/argilla/distilabel_synthetic_dpo_dataset.ipynb) | synthetic data | Argilla |
54 | | [E2B Code Interpreter SDK with Codestral](third_party/E2B_Code_Interpreting) | tool, agent | E2B |
55 | | [function_calling_local.ipynb](third_party/Ollama/function_calling_local.ipynb) | tool call | Ollama |
56 | | [Gradio Integration - Chat with PDF](third_party/gradio/README.md) | UI chat, demo, RAG | Gradio |
57 | | [haystack_chat_with_docs.ipynb](third_party/Haystack/haystack_chat_with_docs.ipynb) | RAG, embeddings | Haystack |
58 | | [Indexify Integration - PDF Entity Extraction](third_party/Indexify/pdf-entity-extraction) | entity extraction, PDF | Indexify |
59 | | [Indexify Integration - PDF Summarization](third_party/Indexify/pdf-summarization) | summarization, PDF | Indexify |
60 | | [langgraph_code_assistant_mistral.ipynb](third_party/langchain/langgraph_code_assistant_mistral.ipynb) | code | Langchain |
61 | | [langgraph_crag_mistral.ipynb](third_party/langchain/langgraph_crag_mistral.ipynb) | RAG | Langchain |
62 | | [llamaindex_agentic_rag.ipynb](third_party/LlamaIndex/llamaindex_agentic_rag.ipynb) | RAG, agent | LLamaIndex |
63 | | [llamaindex_mistralai_finetuning.ipynb](third_party/LlamaIndex/llamaindex_mistralai_finetuning.ipynb) | fine-tuning | LLamaIndex |
64 | | [Microsoft Autogen - Function calling a pgsql db ](third_party/MS_Autogen_pgsql/mistral_pgsql_function_calling.ipynb) | Tool call, agent, RAG | Ms Autogen |
65 | | [Mesop Integration - Chat with PDF](third_party/mesop/README.md) | UI chat, demo, RAG | Mesop |
66 | | [neon_text_to_sql.ipynb](third_party/Neon/neon_text_to_sql.ipynb) | code | Neon |
67 | | [ollama_mistral_llamaindex.ipynb](third_party/LlamaIndex/ollama_mistral_llamaindex.ipynb) | RAG | LLamaIndex |
68 | | [Ollama Meetup Demo](third_party/Ollama/20240321_ollama_meetup) | demo | Ollama |
69 | | [Panel Integration - Chat with PDF](third_party/panel/README.md) | UI chat, demo, RAG | Panel |
70 | | [pinecone_rag.ipynb](third_party/Pinecone/pinecone_rag.ipynb) | RAG | Pinecone |
71 | | [RAG.ipynb](third_party/LlamaIndex/RAG.ipynb) | RAG | LLamaIndex |
72 | | [RouterQueryEngine.ipynb](third_party/LlamaIndex/RouterQueryEngine.ipynb) | agent | LLamaIndex |
73 | | [self_rag_mistral.ipynb](third_party/langchain/self_rag_mistral.ipynb) | RAG | Langchain |
74 | | [Streamlit Integration - Chat with PDF](third_party/streamlit/README.md) | UI chat, demo, RAG | Streamlit |
75 | | [SubQuestionQueryEngine.ipynb](third_party/LlamaIndex/RouterQueryEngine.ipynb) | agent | LLamaIndex |
76 | | [LLM Judge: Detecting hallucinations in language models](third_party/wandb/README.md) | fine-tuning, evaluation | Weights & Biases |
77 | | [`x mistral`: CLI & TUI APP Module in X-CMD](third_party/x-cmd/README.md) | CLI, TUI APP, Chat | x-cmd |
78 |
--------------------------------------------------------------------------------
/third_party/E2B_Code_Interpreting/codestral-code-interpreter-js/index.ts:
--------------------------------------------------------------------------------
1 | import fs from 'node:fs'
2 | import { CodeInterpreter, Result, ProcessMessage } from '@e2b/code-interpreter'
3 | import * as dotenv from 'dotenv'
4 | import MistralClient from '@mistralai/mistralai'
5 |
6 | dotenv.config()
7 |
8 | const MISTRAL_API_KEY = process.env.MISTRAL_API_KEY || ''
9 | const E2B_API_KEY = process.env.E2B_API_KEY || ''
10 |
11 | if (!MISTRAL_API_KEY) {
12 | console.error('Error: MISTRAL_API_KEY is not provided. Please set the MISTRAL_API_KEY in your environment variables.')
13 | process.exit(1)
14 | }
15 |
16 | if (!E2B_API_KEY) {
17 | console.error('Error: E2B_API_KEY is not provided. Please set the E2B_API_KEY in your environment variables.')
18 | process.exit(1)
19 | }
20 |
21 | console.log('MISTRAL_API_KEY:', MISTRAL_API_KEY ? 'Loaded' : 'Not Loaded')
22 | console.log('E2B_API_KEY:', E2B_API_KEY ? 'Loaded' : 'Not Loaded')
23 |
24 | const MODEL_NAME = 'codestral-latest'
25 | const SYSTEM_PROMPT = `
26 | You're a python data scientist. You are given tasks to complete and you run Python code to solve them.
27 |
28 | Information about the csv dataset:
29 | - It's in the \`/home/user/global_economy_indicators.csv\` file
30 | - The CSV file is using , as the delimiter
31 | - It has the following columns (examples included):
32 | - country: "Argentina", "Australia"
33 | - Region: "SouthAmerica", "Oceania"
34 | - Surface area (km2): for example, 2780400
35 | - Population in thousands (2017): for example, 44271
36 | - Population density (per km2, 2017): for example, 16.2
37 | - Sex ratio (m per 100 f, 2017): for example, 95.9
38 | - GDP: Gross domestic product (million current US$): for example, 632343
39 | - GDP growth rate (annual %, const. 2005 prices): for example, 2.4
40 | - GDP per capita (current US$): for example, 14564.5
41 | - Economy: Agriculture (% of GVA): for example, 10.0
42 | - Economy: Industry (% of GVA): for example, 28.1
43 | - Economy: Services and other activity (% of GVA): for example, 61.9
44 | - Employment: Agriculture (% of employed): for example, 4.8
45 | - Employment: Industry (% of employed): for example, 20.6
46 | - Employment: Services (% of employed): for example, 74.7
47 | - Unemployment (% of labour force): for example, 8.5
48 | - Employment: Female (% of employed): for example, 43.7
49 | - Employment: Male (% of employed): for example, 56.3
50 | - Labour force participation (female %): for example, 48.5
51 | - Labour force participation (male %): for example, 71.1
52 | - International trade: Imports (million US$): for example, 59253
53 | - International trade: Exports (million US$): for example, 57802
54 | - International trade: Balance (million US$): for example, -1451
55 | - Education: Government expenditure (% of GDP): for example, 5.3
56 | - Health: Total expenditure (% of GDP): for example, 8.1
57 | - Health: Government expenditure (% of total health expenditure): for example, 69.2
58 | - Health: Private expenditure (% of total health expenditure): for example, 30.8
59 | - Health: Out-of-pocket expenditure (% of total health expenditure): for example, 20.2
60 | - Health: External health expenditure (% of total health expenditure): for example, 0.2
61 | - Education: Primary gross enrollment ratio (f/m per 100 pop): for example, 111.5/107.6
62 | - Education: Secondary gross enrollment ratio (f/m per 100 pop): for example, 104.7/98.9
63 | - Education: Tertiary gross enrollment ratio (f/m per 100 pop): for example, 90.5/72.3
64 | - Education: Mean years of schooling (female): for example, 10.4
65 | - Education: Mean years of schooling (male): for example, 9.7
66 | - Urban population (% of total population): for example, 91.7
67 | - Population growth rate (annual %): for example, 0.9
68 | - Fertility rate (births per woman): for example, 2.3
69 | - Infant mortality rate (per 1,000 live births): for example, 8.9
70 | - Life expectancy at birth, female (years): for example, 79.7
71 | - Life expectancy at birth, male (years): for example, 72.9
72 | - Life expectancy at birth, total (years): for example, 76.4
73 | - Military expenditure (% of GDP): for example, 0.9
74 | - Population, female: for example, 22572521
75 | - Population, male: for example, 21472290
76 | - Tax revenue (% of GDP): for example, 11.0
77 | - Taxes on income, profits and capital gains (% of revenue): for example, 12.9
78 | - Urban population (% of total population): for example, 91.7
79 |
80 | Generally, you follow these rules:
81 | - ALWAYS FORMAT YOUR RESPONSE IN MARKDOWN
82 | - ALWAYS RESPOND ONLY WITH CODE IN CODE BLOCK LIKE THIS:
83 | \`\`\`python
84 | {code}
85 | \`\`\`
86 | - the Python code runs in jupyter notebook.
87 | - every time you generate Python, the code is executed in a separate cell. it's okay to make multiple calls to \`execute_python\`.
88 | - display visualizations using matplotlib or any other visualization library directly in the notebook. don't worry about saving the visualizations to a file.
89 | - you have access to the internet and can make api requests.
90 | - you also have access to the filesystem and can read/write files.
91 | - you can install any pip package (if it exists) if you need to be running \`!pip install {package}\`. The usual packages for data analysis are already preinstalled though.
92 | - you can run any Python code you want, everything is running in a secure sandbox environment
93 | `
94 |
95 | const client = new MistralClient()
96 |
97 | async function codeInterpret(codeInterpreter: CodeInterpreter, code: string): Promise<Result[]> {
98 | console.log('Running code interpreter...')
99 |
100 | const exec = await codeInterpreter.notebook.execCell(code, {
101 | onStderr: (msg: ProcessMessage) => console.log('[Code Interpreter stderr]', msg),
102 | onStdout: (stdout: ProcessMessage) => console.log('[Code Interpreter stdout]', stdout)
103 | })
104 |
105 | if (exec.error) {
106 | console.error('[Code Interpreter ERROR]', exec.error)
107 | throw new Error(exec.error.value)
108 | }
109 |
110 | return exec.results
111 | }
112 |
113 | async function chat(codeInterpreter: CodeInterpreter, userMessage: string): Promise<Result[]> {
114 | console.log(`\n${'='.repeat(50)}\nUser Message: ${userMessage}\n${'='.repeat(50)}`)
115 |
116 | const messages = [
117 | { role: 'system', content: SYSTEM_PROMPT },
118 | { role: 'user', content: userMessage }
119 | ]
120 |
121 | try {
122 | const response = await client.chat({
123 | model: MODEL_NAME,
124 | messages: messages
125 | })
126 |
127 | const responseMessage = response.choices[0].message.content
128 | const codeBlockMatch = responseMessage.match(/```python\n([\s\S]*?)\n```/)
129 |
130 | if (codeBlockMatch && codeBlockMatch[1]) {
131 | const pythonCode = codeBlockMatch[1]
132 | console.log('CODE TO RUN')
133 | console.log(pythonCode)
134 | const codeInterpreterResults = await codeInterpret(codeInterpreter, pythonCode)
135 | return codeInterpreterResults
136 | } else {
137 | console.error('Failed to match any Python code in model\'s response')
138 | return []
139 | }
140 | } catch (error) {
141 | console.error('Error during API call:', error)
142 | throw error
143 | }
144 | }
145 |
146 | async function uploadDataset(codeInterpreter: CodeInterpreter): Promise<string> {
147 | console.log('Uploading dataset to Code Interpreter sandbox...')
148 | const datasetPath = './global_economy_indicators.csv'
149 |
150 | if (!fs.existsSync(datasetPath)) {
151 | throw new Error('Dataset file not found')
152 | }
153 |
154 | // Read the file into a buffer
155 | const fileBuffer = fs.readFileSync(datasetPath)
156 |
157 | try {
158 | const remotePath = await codeInterpreter.uploadFile(fileBuffer, 'global_economy_indicators.csv') // Pass the buffer and filename
159 | if (!remotePath) {
160 | throw new Error('Failed to upload dataset')
161 | }
162 | console.log('Uploaded at', remotePath)
163 | return remotePath
164 | } catch (error) {
165 | console.error('Error during file upload:', error)
166 | throw error
167 | }
168 | }
169 |
170 | async function run() {
171 | const codeInterpreter = await CodeInterpreter.create()
172 |
173 | try {
174 | const remotePath = await uploadDataset(codeInterpreter)
175 | console.log('Remote path of the uploaded dataset:', remotePath)
176 |
177 | const codeInterpreterResults = await chat(
178 | codeInterpreter,
179 | // Task for the model
180 | 'Make a chart showing linear regression of the relationship between GDP per capita and life expectancy from the global_economy_indicators. Filter out any missing values or values in wrong format.'
181 | )
182 | console.log('codeInterpreterResults:', codeInterpreterResults)
183 |
184 | const result = codeInterpreterResults[0]
185 | console.log('Result object:', result)
186 |
187 | if (result && result.png) {
188 | fs.writeFileSync('image_1.png', Buffer.from(result.png, 'base64'))
189 | console.log('Success: Image generated and saved as image_1.png')
190 | } else {
191 | console.error('Error: No PNG data available.')
192 | }
193 |
194 | } catch (error) {
195 | console.error('An error occurred:', error)
196 | } finally {
197 | await codeInterpreter.close()
198 | }
199 | }
200 |
201 | run()
202 |
--------------------------------------------------------------------------------
/third_party/mesop/README.md:
--------------------------------------------------------------------------------
1 | # Chat with Your PDF using Mistral and Mesop
2 |
3 | In this guide, we will introduce the basics of building a chatbot with chat and PDF reading capabilities using `mesop`!
4 |
5 | ## Chat Interface
6 |
7 | First, let's implement a simple chat interface. To do this, we will need to import the `mesop`, `mesop.labs`, `mistralai` libraries, and `ChatMessage` from `mistralai.models.chat_completion`.
8 |
9 | ```shell
10 | pip install mesop mistralai
11 | ```
12 |
13 | *This demo uses `mesop===0.9.3` and `mistralai===0.4.0`*
14 |
15 | ```py
16 | import mesop as me
17 | import mesop.labs as mel
18 | from mistralai.client import MistralClient
19 | from mistralai.models.chat_completion import ChatMessage
20 | ```
21 |
22 | Next, create your `MistralClient` instance using your Mistral API key.
23 |
24 | ```py
25 | mistral_api_key = "api_key"
26 | cli = MistralClient(api_key = mistral_api_key)
27 | ```
28 |
29 | To create our interface with `mesop`, we can make use of their `chat` function. It will look something like this:
30 |
31 | ```py
32 | def ask_mistral(message: str, history: list[mel.ChatMessage]):
33 | messages = [ChatMessage(role=m.role, content=m.content) for m in history[:-1]]
34 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
35 | yield chunk.choices[0].delta.content
36 |
37 | @me.page(title="Talk to Mistral")
38 | def page():
39 | mel.chat(ask_mistral, title="Ask Mistral", bot_user="Mistral")
40 | ```
41 |
42 | Now, all we have to do is run the command `mesop chat.py`!
43 |
44 |
45 | **chat.py**
46 |
47 | ```py
48 | import mesop as me
49 | import mesop.labs as mel
50 | from mistralai.client import MistralClient
51 | from mistralai.models.chat_completion import ChatMessage
52 |
53 | mistral_api_key = "api_key"
54 | cli = MistralClient(api_key = mistral_api_key)
55 |
56 | def ask_mistral(message: str, history: list[mel.ChatMessage]):
57 | messages = [ChatMessage(role=m.role, content=m.content) for m in history[:-1]]
58 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
59 | yield chunk.choices[0].delta.content
60 |
61 | @me.page(title="Talk to Mistral")
62 | def page():
63 | mel.chat(ask_mistral, title="Ask Mistral", bot_user="Mistral")
64 | ```
65 |
66 |
67 |
68 | ## Chatting with PDFs
69 |
70 | To enable our model to read PDFs, we need to convert the content, extract the text, and then use Mistral's embedding model to retrieve chunks of our document(s) to feed to the model. We will need to implement some basic RAG (Retrieval-Augmented Generation)!
71 |
72 | For this task, we will require `faiss`, `PyPDF2`, and other libraries. Let's import them:
73 |
74 | ```shell
75 | pip install numpy PyPDF2 faiss
76 | ```
77 | **For CPU-only setups, please install `faiss-cpu` instead.**
78 |
79 | *This demo uses `numpy===1.26.4`, `PyPDF2===0.4.0` and `faiss-cpu===1.8.0`*
80 |
81 | ```py
82 | import io
83 | import mesop as me
84 | import mesop.labs as mel
85 | from mistralai.client import MistralClient
86 | from mistralai.models.chat_completion import ChatMessage
87 | import numpy as np
88 | import PyPDF2
89 | import faiss
90 | ```
91 |
92 | For our interface to allow the uploading of files, we need to add an uploader to our `page` function.
93 |
94 | ```py
95 | @me.page(title="Talk to Mistral")
96 | def page():
97 | with me.box(style=me.Style(height = "100%", display="flex", flex_direction="column", align_items="center",padding=me.Padding(top = 0, left = 30, right = 30, bottom = 0))):
98 | with me.box(style=me.Style(padding=me.Padding(top = 16), position="fixed")):
99 | me.uploader(
100 | label="Upload PDF",
101 | accepted_file_types=["file/pdf"],
102 | on_upload=handle_upload,
103 | )
104 | with me.box(style=me.Style(width="100%")):
105 | mel.chat(ask_mistral, title="Ask Mistral", bot_user="Mistral")
106 | ```
107 |
108 | Now, our interface will also accept files. The next step is to handle them and extract the text from the PDF files.
109 |
110 | ```py
111 | @me.stateclass
112 | class State:
113 | content: str
114 |
115 | def handle_upload(event: me.UploadEvent):
116 | state = me.state(State)
117 | reader = PyPDF2.PdfReader(io.BytesIO(event.file.getvalue()))
118 | txt = ""
119 | for page in reader.pages:
120 | txt += page.extract_text()
121 | state.content = txt
122 | ```
123 |
124 | We are ready to read the PDF files and implement some RAG. For this, we will need to make a function that retrieves the relevant chunks of text from the PDFs concatenated as a single string. For that, we will make use of Mistral's embeddings. Let's quickly design a function that will convert text to the embeddings:
125 |
126 | ```py
127 | def get_text_embedding(input: str):
128 | embeddings_batch_response = cli.embeddings(
129 | model = "mistral-embed",
130 | input = input
131 | )
132 | return embeddings_batch_response.data[0].embedding
133 | ```
134 |
135 | And now, we can make `rag_pdf` that will handle all the RAG and retrieve the proper chunks:
136 |
137 | ```py
138 | def rag_pdf(pdfs: list, question: str) -> str:
139 | chunk_size = 4096
140 | chunks = []
141 | for pdf in pdfs:
142 | chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
143 |
144 | text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
145 | d = text_embeddings.shape[1]
146 | index = faiss.IndexFlatL2(d)
147 | index.add(text_embeddings)
148 |
149 | question_embeddings = np.array([get_text_embedding(question)])
150 | D, I = index.search(question_embeddings, k = 4)
151 | retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
152 | text_retrieved = "\n\n".join(retrieved_chunk)
153 | return text_retrieved
154 | ```
155 |
156 | In this function, we cut the PDF files into chunks of equal sizes, get their embeddings, and apply some vector search with `faiss` to retrieve the best 4 chunks. The next and last step will be to integrate them with the model:
157 |
158 | ```py
159 | def ask_mistral(message: str, history: list[mel.ChatMessage]):
160 | messages = [ChatMessage(role=m.role, content=m.content) for m in history[:-1]]
161 |
162 | state = me.state(State)
163 | if state.content:
164 | retrieved_text = rag_pdf([state.content], message)
165 | messages[-1] = ChatMessage(role = "user", content = retrieved_text + "\n\n" +messages[-1].content)
166 |
167 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
168 | yield chunk.choices[0].delta.content
169 | ```
170 |
171 | With this, we are ready to go! We can run our script with the command `mesop chat_with_pdfs.py`.
172 |
173 |
174 | **chat_with_pdfs.py**
175 |
176 | ```py
177 | import io
178 | import mesop as me
179 | import mesop.labs as mel
180 | from mistralai.client import MistralClient
181 | from mistralai.models.chat_completion import ChatMessage
182 | import numpy as np
183 | import PyPDF2
184 | import faiss
185 |
186 | mistral_api_key = "api_key"
187 | cli = MistralClient(api_key = mistral_api_key)
188 |
189 | def get_text_embedding(input: str):
190 | embeddings_batch_response = cli.embeddings(
191 | model = "mistral-embed",
192 | input = input
193 | )
194 | return embeddings_batch_response.data[0].embedding
195 |
196 | def rag_pdf(pdfs: list, question: str) -> str:
197 | chunk_size = 4096
198 | chunks = []
199 | for pdf in pdfs:
200 | chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
201 |
202 | text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
203 | d = text_embeddings.shape[1]
204 | index = faiss.IndexFlatL2(d)
205 | index.add(text_embeddings)
206 |
207 | question_embeddings = np.array([get_text_embedding(question)])
208 | D, I = index.search(question_embeddings, k = 4)
209 | retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
210 | text_retrieved = "\n\n".join(retrieved_chunk)
211 | return text_retrieved
212 |
213 | def ask_mistral(message: str, history: list[mel.ChatMessage]):
214 | messages = [ChatMessage(role=m.role, content=m.content) for m in history[:-1]]
215 |
216 | state = me.state(State)
217 | if state.content:
218 | retrieved_text = rag_pdf([state.content], message)
219 | messages[-1] = ChatMessage(role = "user", content = retrieved_text + "\n\n" +messages[-1].content)
220 |
221 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
222 | yield chunk.choices[0].delta.content
223 |
224 | @me.stateclass
225 | class State:
226 | content: str
227 |
228 | def handle_upload(event: me.UploadEvent):
229 | state = me.state(State)
230 | reader = PyPDF2.PdfReader(io.BytesIO(event.file.getvalue()))
231 | txt = ""
232 | for page in reader.pages:
233 | txt += page.extract_text()
234 | state.content = txt
235 |
236 | @me.page(title="Talk to Mistral")
237 | def page():
238 | with me.box(style=me.Style(height = "100%", display="flex", flex_direction="column", align_items="center",padding=me.Padding(top = 0, left = 30, right = 30, bottom = 0))):
239 | with me.box(style=me.Style(padding=me.Padding(top = 16), position="fixed")):
240 | me.uploader(
241 | label="Upload PDF",
242 | accepted_file_types=["file/pdf"],
243 | on_upload=handle_upload,
244 | )
245 | with me.box(style=me.Style(width="100%")):
246 | mel.chat(ask_mistral, title="Ask Mistral", bot_user="Mistral")
247 | ```
248 |
249 |
250 |
--------------------------------------------------------------------------------
/third_party/gradio/README.md:
--------------------------------------------------------------------------------
1 | # Chat with Your PDF using Mistral and Gradio
2 |
3 | In this guide, we will introduce the basics of building a chatbot with chat and PDF reading capabilities using `gradio`!
4 |
5 | **Watch our demo:**
6 | 
7 | [Watch the demo on YouTube](https://www.youtube.com/watch?v=mrHgm7MOipw)
8 |
9 | ## Chat Interface
10 |
11 | First, let's implement a simple chat interface. To do this, we will need to import the `gradio`, `mistralai` libraries, and `ChatMessage` from `mistralai.models.chat_completion`.
12 |
13 | ```shell
14 | pip install gradio mistralai
15 | ```
16 |
17 | *This demo uses `gradio===4.32.2` and `mistralai===0.4.0`*
18 |
19 | ```py
20 | import gradio as gr
21 | from mistralai.client import MistralClient
22 | from mistralai.models.chat_completion import ChatMessage
23 | ```
24 |
25 | Next, create your `MistralClient` instance using your Mistral API key.
26 |
27 | ```py
28 | mistral_api_key = "your_api_key"
29 | cli = MistralClient(api_key = mistral_api_key)
30 | ```
31 |
32 | To create our interface with `gradio`, we can make use of their `ChatInterface`. It will look something like this:
33 |
34 | ```py
35 | def ask_mistral(message: str, history: list):
36 | return "Bot's response."
37 |
38 | app = gr.ChatInterface(fn = ask_mistral, title = "Ask Mistral")
39 | app.launch()
40 | ```
41 |
42 | Now, all we have to do is edit `ask_mistral` so it parses our message and history, calls Mistral's API, and streams the response.
43 |
44 | ```py
45 | def ask_mistral(message: str, history: list):
46 | messages = []
47 | for couple in history:
48 | messages.append(ChatMessage(role = "user", content = couple[0]))
49 | messages.append(ChatMessage(role = "assistant", content = couple[1]))
50 | messages.append(ChatMessage(role = "user", content = message))
51 |
52 | full_response = ""
53 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
54 | full_response += chunk.choices[0].delta.content
55 | yield full_response
56 | ```
57 |
58 | Done! Once ready, you can run the script (`chat.py`)!
59 |
60 |
61 | **chat.py**
62 |
63 | ```py
64 | import gradio as gr
65 | from mistralai.client import MistralClient
66 | from mistralai.models.chat_completion import ChatMessage
67 |
68 | mistral_api_key = "your_api_key"
69 | cli = MistralClient(api_key = mistral_api_key)
70 |
71 | def ask_mistral(message: str, history: list):
72 | messages = []
73 | for couple in history:
74 | messages.append(ChatMessage(role = "user", content = couple[0]))
75 | messages.append(ChatMessage(role = "assistant", content = couple[1]))
76 | messages.append(ChatMessage(role = "user", content = message))
77 |
78 | full_response = ""
79 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
80 | full_response += chunk.choices[0].delta.content
81 | yield full_response
82 |
83 | app = gr.ChatInterface(fn = ask_mistral, title = "Ask Mistral")
84 | app.launch()
85 | ```
86 |
87 |
88 |
89 | ## Chatting with PDFs
90 |
91 | To enable our model to read PDFs, we need to convert the content, extract the text, and then use Mistral's embedding model to retrieve chunks of our document(s) to feed to the model. We will need to implement some basic RAG (Retrieval-Augmented Generation)!
92 |
93 | For this task, we will require `faiss`, `PyPDF2`, and other libraries. Let's import them:
94 |
95 | ```shell
96 | pip install numpy PyPDF2 faiss
97 | ```
98 | **For CPU-only setups, please install `faiss-cpu` instead.**
99 |
100 | *This demo uses `numpy===1.26.4`, `PyPDF2===0.4.0` and `faiss-cpu===1.8.0`*
101 |
102 | ```py
103 | import gradio as gr
104 | from mistralai.client import MistralClient
105 | from mistralai.models.chat_completion import ChatMessage
106 | import numpy as np
107 | import PyPDF2
108 | import faiss
109 | ```
110 |
111 | For our interface to allow the uploading of files, we need to toggle multimodality on our `ChatInterface`.
112 |
113 | ```py
114 | app = gr.ChatInterface(fn = ask_mistral, title = "Ask Mistral and talk to your PDFs", multimodal = True)
115 | app.launch()
116 | ```
117 |
118 | Now, our interface will also accept files. The next step is to handle them and filter out the PDF files from the messages.
119 |
120 | ```py
121 | def ask_mistral(message: str, history: list):
122 | messages = []
123 | pdfs = message["files"]
124 | for couple in history:
125 | if type(couple[0]) is tuple:
126 | pdfs += couple[0]
127 | else:
128 | messages.append(ChatMessage(role = "user", content = couple[0]))
129 | messages.append(ChatMessage(role = "assistant", content = couple[1]))
130 |
131 | messages.append(ChatMessage(role = "user", content = message["text"]))
132 |
133 | full_response = ""
134 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
135 | full_response += chunk.choices[0].delta.content
136 | yield full_response
137 | ```
138 |
139 | We are ready to read the PDF files and implement some RAG. For this, we will need to make a function that retrieves the relevant chunks of text from the PDFs concatenated as a single string. For that, we will make use of Mistral's embeddings. Let's quickly design a function that will convert text to the embeddings:
140 |
141 | ```py
142 | def get_text_embedding(input: str):
143 | embeddings_batch_response = cli.embeddings(
144 | model = "mistral-embed",
145 | input = input
146 | )
147 | return embeddings_batch_response.data[0].embedding
148 | ```
149 |
150 | And now, we can make `rag_pdf` that will handle all the RAG and retrieve the proper chunks:
151 |
152 | ```py
153 | def rag_pdf(pdfs: list, question: str) -> str:
154 | chunk_size = 4096
155 | chunks = []
156 | for pdf in pdfs:
157 | chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
158 |
159 | text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
160 | d = text_embeddings.shape[1]
161 | index = faiss.IndexFlatL2(d)
162 | index.add(text_embeddings)
163 |
164 | question_embeddings = np.array([get_text_embedding(question)])
165 | D, I = index.search(question_embeddings, k = 4)
166 | retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
167 | text_retrieved = "\n\n".join(retrieved_chunk)
168 | return text_retrieved
169 | ```
170 |
171 | In this function, we cut the PDF files into chunks of equal sizes, get their embeddings, and apply some vector search with `faiss` to retrieve the best 4 chunks. The next and last step will be to read the PDF files themselves with `PyPDF2` and integrate them with the model:
172 |
173 | ```py
174 | def ask_mistral(message: str, history: list):
175 | messages = []
176 | pdfs = message["files"]
177 | for couple in history:
178 | if type(couple[0]) is tuple:
179 | pdfs += couple[0]
180 | else:
181 | messages.append(ChatMessage(role = "user", content = couple[0]))
182 | messages.append(ChatMessage(role = "assistant", content = couple[1]))
183 |
184 | if pdfs:
185 | pdfs_extracted = []
186 | for pdf in pdfs:
187 | reader = PyPDF2.PdfReader(pdf)
188 | txt = ""
189 | for page in reader.pages:
190 | txt += page.extract_text()
191 | pdfs_extracted.append(txt)
192 |
193 | retrieved_text = rag_pdf(pdfs_extracted, message["text"])
194 | messages.append(ChatMessage(role = "user", content = retrieved_text + "\n\n" + message["text"]))
195 | else:
196 | messages.append(ChatMessage(role = "user", content = message["text"]))
197 |
198 | full_response = ""
199 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
200 | full_response += chunk.choices[0].delta.content
201 | yield full_response
202 | ```
203 |
204 | With this, we are ready to go! We can run our script `chat_with_pdfs.py`.
205 |
206 |
207 | **chat_with_pdfs.py**
208 |
209 | ```py
210 | import gradio as gr
211 | from mistralai.client import MistralClient
212 | from mistralai.models.chat_completion import ChatMessage
213 | import numpy as np
214 | import PyPDF2
215 | import faiss
216 |
217 | mistral_api_key = "your_api_key"
218 | cli = MistralClient(api_key = mistral_api_key)
219 |
220 | def get_text_embedding(input: str):
221 | embeddings_batch_response = cli.embeddings(
222 | model = "mistral-embed",
223 | input = input
224 | )
225 | return embeddings_batch_response.data[0].embedding
226 |
227 | def rag_pdf(pdfs: list, question: str) -> str:
228 | chunk_size = 4096
229 | chunks = []
230 | for pdf in pdfs:
231 | chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
232 |
233 | text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
234 | d = text_embeddings.shape[1]
235 | index = faiss.IndexFlatL2(d)
236 | index.add(text_embeddings)
237 |
238 | question_embeddings = np.array([get_text_embedding(question)])
239 | D, I = index.search(question_embeddings, k = 4)
240 | retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
241 | text_retrieved = "\n\n".join(retrieved_chunk)
242 | return text_retrieved
243 |
244 | def ask_mistral(message: str, history: list):
245 | messages = []
246 | pdfs = message["files"]
247 | for couple in history:
248 | if type(couple[0]) is tuple:
249 | pdfs += couple[0]
250 | else:
251 | messages.append(ChatMessage(role= "user", content = couple[0]))
252 | messages.append(ChatMessage(role= "assistant", content = couple[1]))
253 |
254 | if pdfs:
255 | pdfs_extracted = []
256 | for pdf in pdfs:
257 | reader = PyPDF2.PdfReader(pdf)
258 | txt = ""
259 | for page in reader.pages:
260 | txt += page.extract_text()
261 | pdfs_extracted.append(txt)
262 |
263 | retrieved_text = rag_pdf(pdfs_extracted, message["text"])
264 | messages.append(ChatMessage(role = "user", content = retrieved_text + "\n\n" + message["text"]))
265 | else:
266 | messages.append(ChatMessage(role = "user", content = message["text"]))
267 |
268 | full_response = ""
269 | for chunk in cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024):
270 | full_response += chunk.choices[0].delta.content
271 | yield full_response
272 |
273 | app = gr.ChatInterface(fn = ask_mistral, title = "Ask Mistral and talk to your PDFs", multimodal = True)
274 | app.launch()
275 | ```
276 |
277 |
--------------------------------------------------------------------------------
/third_party/streamlit/README.md:
--------------------------------------------------------------------------------
1 | # Chat with Your PDF using Mistral and Streamlit
2 |
3 | In this guide, we will introduce the basics of building a chatbot with chat and PDF reading capabilities using `streamlit`!
4 |
5 | **Watch our demo:**
6 | 
7 | [Watch the demo on YouTube](https://www.youtube.com/watch?v=VGSAA-d_Sqo)
8 |
9 | ## Chat Interface
10 |
11 | First, let's implement a simple chat interface. To do this, we will need to import the `streamlit` and `mistralai` libraries.
12 |
13 | ```shell
14 | pip install streamlit mistralai
15 | ```
16 |
17 | *This demo uses `streamlit===1.35.0` and `mistralai===0.4.0`*
18 |
19 | ```py
20 | import streamlit as st
21 | from mistralai.client import MistralClient
22 | ```
23 |
24 | Next, create your `MistralClient` instance using your Mistral API key.
25 |
26 | ```py
27 | mistral_api_key = "your_api_key"
28 | cli = MistralClient(api_key = mistral_api_key)
29 | ```
30 |
31 | Now, we will initialize a session variable where all messages will be stored and display them on the screen.
32 |
33 | ```py
34 | st.title("Chat with Mistral")
35 |
36 | if "messages" not in st.session_state:
37 | st.session_state.messages = []
38 |
39 | for message in st.session_state.messages:
40 | with st.chat_message(message["role"]):
41 | st.markdown(message["content"])
42 | ```
43 |
44 | The following step is to retrieve the input from the user and store it in the list of messages. For this, we will use `chat_input` from `streamlit`!
45 |
46 | ```py
47 | if prompt := st.chat_input("Talk to Mistral!"):
48 | with st.chat_message("user"):
49 | st.markdown(prompt)
50 | st.session_state.messages.append({"role": "user", "content": prompt})
51 | ```
52 |
53 | All that's left is to query Mistral and retrieve the response. To make the interaction smooth, we will handle it by streaming the response. For this, `streamlit` has `write_stream`, which accepts a generator. Let's define a generator!
54 |
55 | ```py
56 | def ask_mistral(messages: list):
57 | resp = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024)
58 | for chunk in resp:
59 | yield chunk.choices[0].delta.content
60 | ```
61 |
62 | With everything set, all we need to do is retrieve the response from the model and save it in the session.
63 |
64 | ```py
65 | if prompt := st.chat_input("Talk to Mistral!"):
66 | with st.chat_message("user"):
67 | st.markdown(prompt)
68 | st.session_state.messages.append({"role": "user", "content": prompt})
69 |
70 | with st.chat_message("assistant"):
71 | response_generator = ask_mistral(st.session_state.messages)
72 | response = st.write_stream(response_generator)
73 |
74 | st.session_state.messages.append({"role": "assistant", "content": response})
75 | ```
76 |
77 | There you go! An interface where you can chat with Mistral's models.
78 |
79 | To run this code, enter `streamlit run chat.py` in the console.
80 |
81 |
82 | **chat.py**
83 |
84 | ```py
85 | import streamlit as st
86 | from mistralai.client import MistralClient
87 |
88 | mistral_api_key = "your_api_key"
89 | cli = MistralClient(api_key=mistral_api_key)
90 |
91 | st.title("Chat with Mistral")
92 |
93 | if "messages" not in st.session_state:
94 | st.session_state.messages = []
95 |
96 | for message in st.session_state.messages:
97 | with st.chat_message(message["role"]):
98 | st.markdown(message["content"])
99 |
100 | def ask_mistral(messages: list):
101 | resp = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024)
102 | for chunk in resp:
103 | yield chunk.choices[0].delta.content
104 |
105 | if prompt := st.chat_input("Talk to Mistral!"):
106 | with st.chat_message("user"):
107 | st.markdown(prompt)
108 | st.session_state.messages.append({"role": "user", "content": prompt})
109 |
110 | with st.chat_message("assistant"):
111 | response_generator = ask_mistral(st.session_state.messages)
112 | response = st.write_stream(response_generator)
113 |
114 | st.session_state.messages.append({"role": "assistant", "content": response})
115 | ```
116 |
117 |
118 |
119 | ## Chatting with PDFs
120 |
121 | To enable our model to read PDFs, we need to extract the text from the uploaded files and then use Mistral's embedding model to retrieve the most relevant chunks of our document(s) to feed to the model. In other words, we will need to implement some basic RAG (Retrieval-Augmented Generation)!
122 |
123 | For this task, we will also need `numpy`, `PyPDF2`, and `faiss`. Let's install them (`io` is part of the Python standard library, so it does not need to be installed):
124 |
125 | ```shell
126 | pip install numpy PyPDF2 faiss-cpu
127 | ```
128 |
129 | **If you have a compatible GPU, you can install `faiss-gpu` instead of `faiss-cpu`.**
130 |
131 | *This demo uses `numpy===1.26.4` and `faiss-cpu===1.8.0`, together with a PyPDF2 3.x release (which provides `PdfReader`)*
132 |
133 | ```py
134 | import io
135 | import streamlit as st
136 | from mistralai.client import MistralClient
137 | import numpy as np
138 | import PyPDF2
139 | import faiss
140 | ```
141 |
142 | Now, we need to add the ability to upload PDF files. For this, let's use `file_uploader` from `streamlit`. The uploaded PDF will then be stored in a new session variable:
143 |
144 | ```py
145 | if "messages" not in st.session_state:
146 | st.session_state.messages = []
147 | st.session_state.pdfs = []
148 |
149 | # The rest of the code...
150 |
151 | uploaded_file = st.file_uploader("Choose a file", type=["pdf"])
152 | if uploaded_file is not None:
153 | bytes_io = io.BytesIO(uploaded_file.getvalue())
154 | st.session_state.pdfs.append(bytes_io)
155 | ```
156 |
157 | The PDFs are now stored, but only as raw bytes. To be able to chat with them, we will need to extract the text and use Mistral's embeddings to retrieve the relevant chunks.
158 |
159 | First, let's define a function that converts text to embeddings with Mistral:
160 |
161 | ```py
162 | def get_text_embedding(input_text: str):
163 | embeddings_batch_response = cli.embeddings(
164 | model = "mistral-embed",
165 | input = input_text
166 | )
167 | return embeddings_batch_response.data[0].embedding
168 | ```
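
As a quick sanity check (assuming your API key is valid), you can embed a short string and inspect the result; `mistral-embed` returns 1024-dimensional vectors:

```py
embedding = get_text_embedding("Hello from the PDF chatbot!")
print(len(embedding))  # expected: 1024
```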
169 |
170 | Next, we can declare a function that handles the retrieval. It will make use of `faiss` for the vector store and the previously created `get_text_embedding` function: it cuts the different files into chunks, creates the embeddings, retrieves the 4 most relevant chunks, and concatenates them into a single string:
171 |
172 | ```py
173 | def rag_pdf(pdfs: list, question: str) -> str:
174 | chunk_size = 4096
175 | chunks = []
176 | for pdf in pdfs:
177 | chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
178 |
179 | text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
180 | d = text_embeddings.shape[1]
181 | index = faiss.IndexFlatL2(d)
182 | index.add(text_embeddings)
183 |
184 | question_embeddings = np.array([get_text_embedding(question)])
185 | D, I = index.search(question_embeddings, k = 4)
186 | retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
187 | text_retrieved = "\n\n".join(retrieved_chunk)
188 | return text_retrieved
189 | ```
190 |
191 | Finally, we edit `ask_mistral` to implement our new RAG over the files! When PDFs are present, the function extracts their text with `PyPDF2` and uses `rag_pdf` to retrieve the relevant context, which is prepended to the user's message before the request is sent to the model:
192 |
193 | ```py
194 | def ask_mistral(messages: list, pdfs_bytes: list):
195 | if pdfs_bytes:
196 | pdfs = []
197 | for pdf in pdfs_bytes:
198 | reader = PyPDF2.PdfReader(pdf)
199 | txt = ""
200 | for page in reader.pages:
201 | txt += page.extract_text()
202 | pdfs.append(txt)
203 | messages[-1]["content"] = rag_pdf(pdfs, messages[-1]["content"]) + "\n\n" + messages[-1]["content"]
204 |
205 | resp = cli.chat_stream(model = "open-mistral-7b", messages = messages, max_tokens = 1024)
206 | for chunk in resp:
207 | yield chunk.choices[0].delta.content
208 |
209 | # Don't forget to add the new argument 'pdfs_bytes = st.session_state.pdfs' when you call this function.
210 | ```
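
For example, the call site from the chat section now becomes:

```py
with st.chat_message("assistant"):
    response_generator = ask_mistral(st.session_state.messages, pdfs_bytes = st.session_state.pdfs)
    response = st.write_stream(response_generator)
```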
211 |
212 | And everything is done! Now we can run our new interface with `streamlit run chat_with_pdfs.py`.
213 |
214 |
215 | chat_with_pdfs.py
216 |
217 | ```py
218 | import io
219 | import streamlit as st
220 | from mistralai.client import MistralClient
221 | import numpy as np
222 | import PyPDF2
223 | import faiss
224 |
225 | mistral_api_key = "your_api_key"
226 | cli = MistralClient(api_key = mistral_api_key)
227 |
228 | def get_text_embedding(input: str):
229 | embeddings_batch_response = cli.embeddings(
230 | model="mistral-embed",
231 | input=input
232 | )
233 | return embeddings_batch_response.data[0].embedding
234 |
235 | def rag_pdf(pdfs: list, question: str) -> str:
236 | chunk_size = 4096
237 | chunks = []
238 | for pdf in pdfs:
239 | chunks += [pdf[i:i + chunk_size] for i in range(0, len(pdf), chunk_size)]
240 |
241 | text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
242 | d = text_embeddings.shape[1]
243 | index = faiss.IndexFlatL2(d)
244 | index.add(text_embeddings)
245 |
246 | question_embeddings = np.array([get_text_embedding(question)])
247 | D, I = index.search(question_embeddings, k = 4)
248 | retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
249 | text_retrieved = "\n\n".join(retrieved_chunk)
250 | return text_retrieved
251 |
252 | st.title("Chat with Mistral")
253 |
254 | if "messages" not in st.session_state:
255 | st.session_state.messages = []
256 | st.session_state.pdfs = []
257 |
258 | for message in st.session_state.messages:
259 | with st.chat_message(message["role"]):
260 | st.markdown(message["content"])
261 |
262 | def ask_mistral(messages: list, pdfs_bytes: list):
263 | if pdfs_bytes:
264 | pdfs = []
265 | for pdf in pdfs_bytes:
266 | reader = PyPDF2.PdfReader(pdf)
267 | txt = ""
268 | for page in reader.pages:
269 | txt += page.extract_text()
270 | pdfs.append(txt)
271 | messages[-1]["content"] = rag_pdf(pdfs, messages[-1]["content"]) + "\n\n" + messages[-1]["content"]
272 |
273 | resp = cli.chat_stream(model="open-mistral-7b", messages = messages, max_tokens = 1024)
274 | for chunk in resp:
275 | yield chunk.choices[0].delta.content
276 |
277 | if prompt := st.chat_input("Talk to Mistral!"):
278 | with st.chat_message("user"):
279 | st.markdown(prompt)
280 | st.session_state.messages.append({"role": "user", "content": prompt})
281 |
282 | with st.chat_message("assistant"):
283 | response_generator = ask_mistral(st.session_state.messages, st.session_state.pdfs)
284 | response = st.write_stream(response_generator)
285 |
286 | st.session_state.messages.append({"role": "assistant", "content": response})
287 |
288 | uploaded_file = st.file_uploader("Choose a file", type = ["pdf"])
289 | if uploaded_file is not None:
290 | bytes_io = io.BytesIO(uploaded_file.getvalue())
291 | st.session_state.pdfs.append(bytes_io)
292 | ```
293 |
294 |
--------------------------------------------------------------------------------
/third_party/LlamaIndex/propertygraphs/property_graph_neo4j.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# PropertyGraph using Neo4j\n",
8 | "\n",
9 | "In this notebook we will demonstrate building PropertyGraph using Neo4j\n",
10 | "\n",
11 | "Neo4j is a production-grade graph database that excels in storing property graphs, performing vector searches, filtering, and more.\n",
12 | "\n",
13 | "The simplest way to begin is by using a cloud-hosted instance through Neo4j Aura. However, for the purposes of this notebook, we will focus on how to run the database locally using Docker."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "%pip install llama-index-core \n",
23 | "%pip install llama-index-graph-stores-neo4j\n",
24 | "%pip install llama-index-llms-mistralai\n",
25 | "%pip install llama-index-embeddings-mistralai"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## Docker Setup\n",
33 | "\n",
34 | "You need to login and set password for the first time.\n",
35 | "\n",
36 | "1. username: neo4j\n",
37 | "\n",
38 | "2. password: neo4j"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "!docker run \\\n",
48 | " -p 7474:7474 -p 7687:7687 \\\n",
49 | " -v $PWD/data:/data -v $PWD/plugins:/plugins \\\n",
50 | " --name neo4j-apoc \\\n",
51 | " -e NEO4J_apoc_export_file_enabled=true \\\n",
52 | " -e NEO4J_apoc_import_file_enabled=true \\\n",
53 | " -e NEO4J_apoc_import_file_use__neo4j__config=true \\\n",
54 | " -e NEO4JLABS_PLUGINS=\\[\\\"apoc\\\"\\] \\\n",
55 | " neo4j:latest"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Setup"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 1,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "import nest_asyncio\n",
72 | "\n",
73 | "nest_asyncio.apply()\n",
74 | "\n",
75 | "from IPython.display import Markdown, display"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 2,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "import os\n",
85 | "os.environ['MISTRAL_API_KEY'] = 'YOUR MISTRAL API KEY'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "from llama_index.embeddings.mistralai import MistralAIEmbedding\n",
95 | "from llama_index.llms.mistralai import MistralAI\n",
96 | "\n",
97 | "llm = MistralAI(model='mistral-large-latest')\n",
98 | "embed_model = MistralAIEmbedding()"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "## Download Data"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "!mkdir -p 'data/paul_graham/'\n",
115 | "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "## Load Data"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 4,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from llama_index.core import SimpleDirectoryReader\n",
132 | "\n",
133 | "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "## Index Construction"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 5,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stderr",
150 | "output_type": "stream",
151 | "text": [
152 | "Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The procedure has a deprecated field. ('config' used by 'apoc.meta.graphSample' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: \"CALL apoc.meta.graphSample() YIELD nodes, relationships RETURN nodes, [rel in relationships | {name:apoc.any.property(rel, 'type'), count: apoc.any.property(rel, 'count')}] AS relationships\"\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore\n",
158 | "\n",
159 | "# Note: used to be `Neo4jPGStore`\n",
160 | "graph_store = Neo4jPropertyGraphStore(\n",
161 | " username=\"neo4j\",\n",
162 | " password=\"llamaindex\",\n",
163 | " url=\"bolt://localhost:7687\",\n",
164 | ")"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stderr",
174 | "output_type": "stream",
175 | "text": [
176 | "/Users/ravithejad/Desktop/llamaindex/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
177 | " from .autonotebook import tqdm as notebook_tqdm\n",
178 | "Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 27.19it/s]\n",
179 | "Extracting paths from text: 100%|██████████| 22/22 [00:42<00:00, 1.92s/it]\n",
180 | "Generating embeddings: 100%|██████████| 3/3 [00:01<00:00, 2.60it/s]\n",
181 | "Generating embeddings: 100%|██████████| 40/40 [00:13<00:00, 2.86it/s]\n",
182 | "Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The procedure has a deprecated field. ('config' used by 'apoc.meta.graphSample' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: \"CALL apoc.meta.graphSample() YIELD nodes, relationships RETURN nodes, [rel in relationships | {name:apoc.any.property(rel, 'type'), count: apoc.any.property(rel, 'count')}] AS relationships\"\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "from llama_index.core import PropertyGraphIndex\n",
188 | "from llama_index.embeddings.openai import OpenAIEmbedding\n",
189 | "from llama_index.llms.openai import OpenAI\n",
190 | "from llama_index.core.indices.property_graph import SimpleLLMPathExtractor\n",
191 | "\n",
192 | "index = PropertyGraphIndex.from_documents(\n",
193 | " documents,\n",
194 | " embed_model=embed_model,\n",
195 | " kg_extractors=[\n",
196 | " SimpleLLMPathExtractor(\n",
197 | " llm=llm\n",
198 | " )\n",
199 | " ],\n",
200 | " property_graph_store=graph_store,\n",
201 | " show_progress=True,\n",
202 | ")"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 7,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "from llama_index.core import Settings\n",
212 | "Settings.llm = llm\n",
213 | "Settings.embed_model = embed_model"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Retrievers"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 10,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "from llama_index.core.indices.property_graph import (\n",
230 | " LLMSynonymRetriever,\n",
231 | " VectorContextRetriever,\n",
232 | ")\n",
233 | "\n",
234 | "\n",
235 | "llm_synonym = LLMSynonymRetriever(\n",
236 | " index.property_graph_store,\n",
237 | " llm=llm,\n",
238 | " include_text=False,\n",
239 | ")\n",
240 | "vector_context = VectorContextRetriever(\n",
241 | " index.property_graph_store,\n",
242 | " embed_model=embed_model,\n",
243 | " include_text=False,\n",
244 | ")"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "## Querying"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "### Retrieving"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 16,
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "name": "stdout",
268 | "output_type": "stream",
269 | "text": [
270 | "Yahoo -> Bought -> Viaweb in 1998\n",
271 | "Hacker news -> Source of stress for -> Author\n",
272 | "Author -> Wrote -> Yc's internal software in arc\n",
273 | "Author -> Advised by -> Robert morris to not make yc the last cool thing\n",
274 | "Author -> Decided to hand yc over to -> Sam altman\n",
275 | "Author -> Worked on -> Writing essays and yc\n",
276 | "Viaweb -> Software -> Works via the web\n",
277 | "Robert morris -> Showed -> World wide web\n"
278 | ]
279 | }
280 | ],
281 | "source": [
282 | "retriever = index.as_retriever(\n",
283 | " sub_retrievers=[\n",
284 | " llm_synonym,\n",
285 | " vector_context,\n",
286 | " ],\n",
287 | ")\n",
288 | "\n",
289 | "nodes = retriever.retrieve(\"What did author do at Viaweb?\")\n",
290 | "\n",
291 | "for node in nodes:\n",
292 | " print(node.text)"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "### QueryEngine"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 17,
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/markdown": [
310 | "The author, along with Robert Morris, started Viaweb, a company that aimed to build online stores. The author's role involved writing software to generate websites for galleries initially, and later, developing a new site generator for online stores using Lisp. The author also had the innovative idea of running the software on the server and letting users control it by clicking on links, eliminating the need for any client software or command line interaction on the server. This led to the creation of a web app, which at the time was a novel concept."
311 | ],
312 | "text/plain": [
313 | ""
314 | ]
315 | },
316 | "metadata": {},
317 | "output_type": "display_data"
318 | }
319 | ],
320 | "source": [
321 | "query_engine = index.as_query_engine(include_text=True)\n",
322 | "\n",
323 | "response = query_engine.query(\"What did author do at Viaweb?\")\n",
324 | "\n",
325 | "display(Markdown(f\"{response.response}\"))"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": []
334 | }
335 | ],
336 | "metadata": {
337 | "kernelspec": {
338 | "display_name": "llamaindex",
339 | "language": "python",
340 | "name": "llamaindex"
341 | },
342 | "language_info": {
343 | "codemirror_mode": {
344 | "name": "ipython",
345 | "version": 3
346 | },
347 | "file_extension": ".py",
348 | "mimetype": "text/x-python",
349 | "name": "python",
350 | "nbconvert_exporter": "python",
351 | "pygments_lexer": "ipython3",
352 | "version": "3.9.6"
353 | }
354 | },
355 | "nbformat": 4,
356 | "nbformat_minor": 2
357 | }
358 |
--------------------------------------------------------------------------------
/third_party/LlamaIndex/RouterQueryEngine.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "id": "56c23751",
7 | "metadata": {},
8 | "source": [
9 | ""
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "02c3900a-6024-42ec-af62-af1001fd4af5",
15 | "metadata": {},
16 | "source": [
17 | "# Router Query Engine\n",
18 | "\n",
19 | "A `VectorStoreIndex` is designed to handle queries related to specific contexts, while a `SummaryIndex` is optimized for answering summarization queries. However, in real-world scenarios, user queries may require either context-specific responses or summarizations. To address this, the system must effectively route user queries to the appropriate index to provide relevant answers.\n",
20 | "\n",
21 | "In this notebook, we will utilize the `RouterQueryEngine` to direct user queries to the appropriate index based on the query type."
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "b451e7df-78eb-457e-ae3d-d84af7fef8e3",
27 | "metadata": {},
28 | "source": [
29 | "### Installation"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "id": "ad1e167f-2332-4be0-844e-0fcb0e97d663",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "!pip install llama-index\n",
40 | "!pip install llama-index-llms-mistralai\n",
41 | "!pip install llama-index-embeddings-mistralai"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "id": "fa002b8e-4a22-4033-ab71-0ade6065d19a",
47 | "metadata": {},
48 | "source": [
49 | "### Setup API Key"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 1,
55 | "id": "e2447175-3cdf-4c41-8846-d678fed72e3a",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import os\n",
60 | "os.environ['MISTRAL_API_KEY'] = 'YOUR MISTRAL API KEY'"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "id": "71dd4757-05e6-4357-a4de-681634d9a55b",
66 | "metadata": {},
67 | "source": [
68 | "### Set LLM and Embedding Model"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "id": "6572ec1c-bbbc-4d15-86a3-a7ecf72360e1",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "import nest_asyncio\n",
79 | "\n",
80 | "nest_asyncio.apply()"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 3,
86 | "id": "a1ae8238-f53e-4e6c-8448-08c9ab3894a4",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from llama_index.core import SimpleDirectoryReader, VectorStoreIndex\n",
91 | "from llama_index.llms.mistralai import MistralAI\n",
92 | "from llama_index.embeddings.mistralai import MistralAIEmbedding\n",
93 | "from llama_index.core import Settings\n",
94 | "\n",
95 | "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
96 | "from llama_index.core.query_engine.router_query_engine import RouterQueryEngine\n",
97 | "from llama_index.core.selectors.llm_selectors import LLMSingleSelector"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 4,
103 | "id": "d5ac41d1-2223-4577-a533-cbf1949607c8",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "llm = MistralAI(model='mistral-large')\n",
108 | "embed_model = MistralAIEmbedding()\n",
109 | "\n",
110 | "Settings.llm = llm\n",
111 | "Settings.embed_model = embed_model"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "96d5cbc1-d469-4872-ab79-049ff5234819",
117 | "metadata": {},
118 | "source": [
119 | "### Download Data\n",
120 | "\n",
121 | "We will use `Uber 10K SEC Filings`."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 5,
127 | "id": "6f2f70e2-7a18-417e-82c0-158c784a265e",
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "--2024-03-31 00:24:17-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf\n",
135 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...\n",
136 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
137 | "HTTP request sent, awaiting response... 200 OK\n",
138 | "Length: 1880483 (1.8M) [application/octet-stream]\n",
139 | "Saving to: ‘data/10k/uber_2021.pdf’\n",
140 | "\n",
141 | "data/10k/uber_2021. 100%[===================>] 1.79M --.-KB/s in 0.05s \n",
142 | "\n",
143 | "2024-03-31 00:24:17 (38.5 MB/s) - ‘data/10k/uber_2021.pdf’ saved [1880483/1880483]\n",
144 | "\n"
145 | ]
146 | }
147 | ],
148 | "source": [
149 | "!mkdir -p 'data/10k/'\n",
150 | "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "id": "8bde9293-51cf-4406-b628-a97891bc281b",
156 | "metadata": {},
157 | "source": [
158 | "### Load Data"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 22,
164 | "id": "7f93584f-b19d-4595-a2e6-f6124be7ff2b",
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "uber_docs = SimpleDirectoryReader(input_files=[\"./data/10k/uber_2021.pdf\"]).load_data()"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "id": "459d163f-f15f-4363-a657-857e288093eb",
174 | "metadata": {},
175 | "source": [
176 | "### Index and Query Engine creation\n",
177 | "\n",
178 | " 1. VectorStoreIndex -> Specific context queries\n",
179 | " 2. SummaryIndex -> Summarization queries"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 24,
185 | "id": "69dbbbdc-f0f3-408a-9a2c-dd65a6db9cb4",
186 | "metadata": {
187 | "scrolled": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "uber_vector_index = VectorStoreIndex.from_documents(uber_docs)\n",
192 | "\n",
193 | "uber_summary_index = VectorStoreIndex.from_documents(uber_docs)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 26,
199 | "id": "14bb6f0b-5ccd-4ed0-8227-6d488d4e5adf",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "uber_vector_query_engine = uber_vector_index.as_query_engine(similarity_top_k = 5)\n",
204 | "uber_summary_query_engine = uber_summary_index.as_query_engine()"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "8df11259-e0ab-496d-8b0b-cb26adfb6ef4",
210 | "metadata": {},
211 | "source": [
212 | "### Create Tools"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 27,
218 | "id": "8ffe1664-bc36-416a-a1b0-ded7d4468508",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "query_engine_tools = [\n",
223 | " QueryEngineTool(\n",
224 | " query_engine=uber_vector_query_engine,\n",
225 | " metadata=ToolMetadata(\n",
226 | " name=\"vector_engine\",\n",
227 | " description=(\n",
228 | " \"Provides information about Uber financials for year 2021.\"\n",
229 | " ),\n",
230 | " ),\n",
231 | " ),\n",
232 | " QueryEngineTool(\n",
233 | " query_engine=uber_summary_query_engine,\n",
234 | " metadata=ToolMetadata(\n",
235 | " name=\"summary_engine\",\n",
236 | " description=(\n",
237 | " \"Provides Summary about Uber financials for year 2021.\"\n",
238 | " ),\n",
239 | " ),\n",
240 | " ),\n",
241 | "]"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "id": "65f8b9d0-b1c1-4bf4-8f78-1d67d984bc7c",
247 | "metadata": {},
248 | "source": [
249 | "### Create Router Query Engine"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 28,
255 | "id": "1ae0a39e-c14b-4c90-a443-8ad80e5ccd71",
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "query_engine = RouterQueryEngine(\n",
260 | " selector=LLMSingleSelector.from_defaults(),\n",
261 | " query_engine_tools=query_engine_tools,\n",
262 | " verbose = True\n",
263 | ")"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "id": "5d382576-949d-43ce-8572-7d78e8215602",
269 | "metadata": {},
270 | "source": [
271 | "### Querying"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "id": "1109d5ec-48fd-4457-ab1a-64948fa8d913",
277 | "metadata": {},
278 | "source": [
279 | "#### Summarization Query\n",
280 | "\n",
281 | "You can see that it uses `SummaryIndex` to provide answer to the summarization query."
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 31,
287 | "id": "186cc0d1-5c2f-422d-9324-749306a049aa",
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "name": "stdout",
292 | "output_type": "stream",
293 | "text": [
294 | "\u001b[1;3;38;5;200mSelecting query engine 1: This choice specifically mentions a 'summary' of Uber's financials for the year 2021, which directly aligns with the question asked..\n",
295 | "\u001b[0mIn 2021, Uber's Gross Bookings increased by $32.5 billion, a 56% increase compared to 2020. This growth was driven by a 66% increase in Delivery Gross Bookings due to higher demand for food delivery and larger order sizes, as well as expansion in U.S. and international markets. Mobility Gross Bookings also grew by 36% due to increased trip volumes as the business recovered from COVID-19 impacts.\n",
296 | "\n",
297 | "Uber's revenue for the year was $17.5 billion, a 57% increase from the previous year. This growth was attributed to the overall expansion of the Delivery business and an increase in Freight revenue due to the acquisition of Transplace in the fourth quarter of 2021.\n",
298 | "\n",
299 | "The net loss attributable to Uber Technologies, Inc. was $496 million, a 93% improvement from the previous year. This improvement was driven by a $1.6 billion pre-tax gain on the sale of the ATG Business to Aurora, a $1.6 billion pre-tax net benefit related to Uber’s equity investments, as well as reductions in fixed costs and increased variable cost efficiencies. The net loss also included $1.2 billion in stock-based compensation expense.\n",
300 | "\n",
301 | "Adjusted EBITDA loss was $774 million, an improvement of $1.8 billion from 2020. Mobility Adjusted EBITDA profit was $1.6 billion, and Delivery Adjusted EBITDA loss was $348 million, an improvement of $525 million from the previous year.\n",
302 | "\n",
303 | "Uber ended the year with $4.3 billion in cash and cash equivalents. The company also completed several acquisitions in 2021, including the remaining 45% ownership interest in Cornershop and 100% ownership interest in Drizly, an on-demand alcohol marketplace in North America.\n"
304 | ]
305 | }
306 | ],
307 | "source": [
308 | "response = query_engine.query(\"What is the summary of the Uber Financials in 2021?\")\n",
309 | "print(response)"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "id": "b3806f9b-ada8-43d0-8276-4b0a2a1c8835",
315 | "metadata": {},
316 | "source": [
317 | "#### Specific Context Query\n",
318 | "\n",
319 | "You can see it uses `VectorStoreIndex` to answer specific context type query."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 29,
325 | "id": "dd6c321b-2c97-4701-82a1-53455307adc2",
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "name": "stdout",
330 | "output_type": "stream",
331 | "text": [
332 | "\u001b[1;3;38;5;200mSelecting query engine 0: This choice is more likely to contain detailed financial information about Uber in 2021, including revenue..\n",
333 | "\u001b[0mThe revenue of Uber in 2021 was $17,455 million.\n"
334 | ]
335 | }
336 | ],
337 | "source": [
338 | "response = query_engine.query(\"What is the the revenue of Uber in 2021?\")\n",
339 | "print(response)"
340 | ]
341 | }
342 | ],
343 | "metadata": {
344 | "kernelspec": {
345 | "display_name": "Python 3",
346 | "language": "python",
347 | "name": "python3"
348 | },
349 | "language_info": {
350 | "codemirror_mode": {
351 | "name": "ipython",
352 | "version": 3
353 | },
354 | "file_extension": ".py",
355 | "mimetype": "text/x-python",
356 | "name": "python",
357 | "nbconvert_exporter": "python",
358 | "pygments_lexer": "ipython3",
359 | "version": "3.11.3 (main, Apr 7 2023, 19:08:44) [Clang 13.0.0 (clang-1300.0.29.30)]"
360 | },
361 | "vscode": {
362 | "interpreter": {
363 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
364 | }
365 | }
366 | },
367 | "nbformat": 4,
368 | "nbformat_minor": 5
369 | }
370 |
--------------------------------------------------------------------------------
/data/northwind-queries.jsonl:
--------------------------------------------------------------------------------
1 | {"question": "Select all category names with their descriptions from the Categories table.", "query": "SELECT category_name, description FROM categories"}
2 | {"question": "Select the contact name, customer id, and company name of all Customers in London", "query": "SELECT contact_name, customer_id, company_name FROM customers WHERE city = 'London'"}
3 | {"question": "Marketing managers and sales representatives have asked you to select all available columns in the Suppliers tables that have a FAX number.", "query": "SELECT * FROM suppliers WHERE NOT fax IS NULL"}
4 | {"question": "Select a list of customers id\u2019s from the Orders table with required dates between Jan 1, 1997 and Jan 1, 1998 and with freight under 100 units.", "query": "SELECT customer_id FROM orders WHERE required_date BETWEEN '1997-01-01' AND 'Jan 1, 1998' AND freight < 100"}
5 | {"question": "Select a list of company names and contact names of all the Owners from the Customer table from Mexico, Sweden and Germany.", "query": "SELECT company_name, contact_name FROM customers WHERE country IN ('Mexico', 'Sweden', 'Germany') AND contact_title = 'Owner'"}
6 | {"question": "Count the number of discontinued products in the Products table.", "query": "SELECT COUNT(*) FROM products WHERE discontinued = 1"}
7 | {"question": "Select a list of category names and descriptions of all categories beginning with 'Co' from the Categories table.", "query": "SELECT \"category_name\", \"description\" FROM \"categories\" WHERE \"category_name\" LIKE 'Co%'"}
8 | {"question": "Select all the company names, city, country and postal code from the Suppliers table with the word 'rue' in their address. The list should be ordered alphabetically by company name.", "query": "SELECT company_name, city, country, postal_code FROM suppliers WHERE address LIKE '%rue%' ORDER BY company_name NULLS FIRST"}
9 | {"question": "Select the product id and the total quantities ordered for each product id in the Order Details table.", "query": "SELECT product_id AS \"product_id\", SUM(quantity) AS \"total_quantity\" FROM \"order_details\" GROUP BY product_id ORDER BY \"total_quantity\" NULLS FIRST"}
10 | {"question": "Select the customer name and customer address of all customers with orders that shipped using Speedy Express.", "query": "SELECT DISTINCT customers.contact_name, customers.address FROM customers INNER JOIN orders ON customers.customer_id = orders.customer_id INNER JOIN shippers ON orders.ship_via = shippers.shipper_id WHERE shippers.company_name = 'Speedy Express'"}
11 | {"question": "Select a list of Suppliers containing company name, contact name, contact title and region description.", "query": "SELECT company_name, contact_name, contact_title, region FROM suppliers WHERE NOT contact_name IS NULL AND NOT contact_title IS NULL AND NOT region IS NULL"}
12 | {"question": "Select all product names from the Products table that are condiments.", "query": "SELECT products.product_name FROM products INNER JOIN categories ON products.category_id = categories.category_id WHERE categories.category_name = 'Condiments'"}
13 | {"question": "Select a list of customer names who have no orders in the Orders table.", "query": "SELECT contact_name FROM customers WHERE NOT customer_id IN (SELECT DISTINCT customer_id FROM orders)"}
14 | {"question": "Select a complete list of company names from the Shippers table. \n--Include freight totals rounded to the nearest whole number for each shipper from the Orders table for those shippers with orders.", "query": "SELECT shippers.company_name, ROUND(CAST(SUM(orders.freight) AS INT), 0) AS \"total_freights\" FROM shippers LEFT OUTER JOIN orders ON orders.ship_via = shippers.shipper_id GROUP BY shippers.company_name"}
15 | {"question": "Select all employee first and last names from the Employees table by combining the 2 columns aliased as 'DisplayName'.\n--The combined format should be 'LastName, FirstName'.", "query": "SELECT CONCAT(last_name, ', ', first_name) AS \"display_name\" FROM employees"}
16 | {"question": "Select a list of products from the Products table along with the total units in stock for each product.\n--Give the computed column a name using the alias, 'TotalUnits'. Include only products with TotalUnits greater than 100.", "query": "SELECT product_name, units_in_stock AS total_units FROM products WHERE units_in_stock > 100"}
17 | {"question": "Select the name, address, city, and region of employees.", "query": "SELECT first_name, address, city, COALESCE(region, '-') AS \"region\" FROM employees"}
18 | {"question": "Select the name, address, city, and region of employees living in USA", "query": "SELECT first_name, \"address\", city, COALESCE(region, ' - ') AS \"region\" FROM employees WHERE country = 'USA'"}
19 | {"question": "Select the name, address, city, and region of employees older than 50 years old.", "query": "SELECT last_name, first_name, \"address\", city, COALESCE(region, ' - ') AS \"region\" FROM employees WHERE CAST(EXTRACT(year FROM AGE(CAST(CAST(CURRENT_TIMESTAMP AS TIMESTAMP) AS TIMESTAMP), CAST(CAST(birth_date AS TIMESTAMP) AS TIMESTAMP))) AS BIGINT) > 50"}
20 | {"question": "Select the name, address, city, and region of employees that have placed orders to be delivered in Belgium. Write two versions of the query, with and without join.", "query": "SELECT DISTINCT e.last_name, e.first_name, e.\"address\", e.city, COALESCE(e.region, ' - ') AS \"region\" FROM employees AS e INNER JOIN orders AS o ON e.employee_id = o.employee_id WHERE o.ship_country = 'Belgium'"}
21 | {"question": "Select the employee name and the customer name for orders that are sent by the company \u2018Speedy Express\u2019 to customers who live in Brussels.", "query": "SELECT DISTINCT e.last_name, e.first_name, c.contact_name FROM employees AS e JOIN orders AS o ON o.employee_id = e.employee_id JOIN customers AS c ON o.customer_id = c.customer_id JOIN shippers AS s ON o.ship_via = s.shipper_id WHERE s.company_name = 'Speedy Express' AND c.city = 'Bruxelles'"}
22 | {"question": "Select the title and name of employees who have sold at least one of the products \u2018Gravad Lax\u2019 or \u2018Mishi Kobe Niku\u2019.", "query": "SELECT DISTINCT employees.last_name, employees.first_name, employees.title FROM employees INNER JOIN orders ON orders.employee_id = employees.employee_id INNER JOIN \"order_details\" ON \"order_details\".order_id = orders.order_id INNER JOIN products ON \"order_details\".product_id = products.product_id WHERE products.product_name IN ('Gravad Lax', 'Mishi Kobe Niku')"}
23 | {"question": "Select the name and title of employees and the name and title of the person to which they refer (or null for the latter values if they don\u2019t refer to another employee).", "query": "SELECT e.first_name AS \"employee_name\", e.last_name AS \"employee_lastname\", b.last_name AS \"reports_to\" FROM employees AS e LEFT OUTER JOIN employees AS b ON e.reports_to = b.employee_id"}
24 | {"question": "Select the customer name, the product name and the supplier name for customers who live in London and suppliers whose name is \u2018Pavlova, Ltd.\u2019 or \u2018Karkki Oy\u2019.", "query": "SELECT DISTINCT c.contact_name AS \"customer\", p.product_name AS \"product\", s.contact_name AS \"supplier\" FROM customers AS c JOIN orders AS o ON o.customer_id = c.customer_id JOIN \"order_details\" AS od ON od.order_id = o.order_id JOIN products AS p ON p.product_id = od.product_id JOIN suppliers AS s ON s.supplier_id = p.supplier_id WHERE c.city = 'London' AND s.company_name IN ('Pavlova, Ltd.', 'Karkki Oy')"}
25 | {"question": "Select the name of products that were bought or sold by people who live in London.", "query": "SELECT DISTINCT p.product_name FROM products AS p JOIN \"order_details\" AS od ON od.product_id = p.product_id JOIN orders AS o ON o.order_id = od.order_id JOIN employees AS e ON e.employee_id = o.employee_id JOIN customers AS c ON c.customer_id = o.customer_id WHERE c.city = 'London' OR e.city = 'London'"}
26 | {"question": "Select the names of employees who are strictly older than: (a) any employee who lives in London. (b) all employees who live in London.", "query": "SELECT last_name, first_name FROM employees WHERE birth_date < ALL (SELECT birth_date FROM employees WHERE city = 'London')"}
27 | {"question": "Select the name of employees who work longer than any employee of London.", "query": "SELECT last_name, first_name FROM employees WHERE hire_date < ALL (SELECT hire_date FROM employees WHERE city = 'London')"}
28 | {"question": "Select the name of employees and the city where they live for employees who have sold to customers in the same city.", "query": "SELECT DISTINCT e.last_name, e.first_name, e.city FROM employees AS e JOIN orders AS o ON o.employee_id = e.employee_id JOIN customers AS c ON c.customer_id = o.customer_id WHERE e.city = c.city"}
29 | {"question": "", "query": "SELECT DISTINCT CONCAT(e.last_name, ' ', e.first_name) AS employee, e.city FROM employees AS e JOIN customers AS c ON e.city = c.city"}
30 | {"question": "Select the name of customers who have not purchased any product.", "query": "SELECT DISTINCT contact_name FROM customers WHERE NOT customer_id IN (SELECT DISTINCT customer_id FROM orders)"}
31 | {"question": "Select the name of customers who bought only products with price less than 50.", "query": "SELECT contact_name FROM customers WHERE NOT customer_id IN (SELECT DISTINCT customers.customer_id FROM customers INNER JOIN orders ON orders.customer_id = customers.customer_id INNER JOIN \"order_details\" ON \"order_details\".order_id = orders.order_id INNER JOIN products ON products.product_id = \"order_details\".product_id WHERE products.unit_price >= 50) AND customer_id IN (SELECT DISTINCT customer_id FROM orders)"}
32 | {"question": "Select the name of the products sold by all employees.", "query": "SELECT p.product_name FROM products AS p JOIN \"order_details\" AS od ON p.product_id = od.product_id JOIN orders AS o ON od.order_id = o.order_id GROUP BY p.product_name HAVING COUNT(DISTINCT o.employee_id) = (SELECT COUNT(*) FROM employees)"}
33 | {"question": "Select the name of customers who bought all products purchased by the customer whose identifier is \u2018LAZYK\u2019", "query": "WITH products_of_lazyk AS (SELECT DISTINCT \"order_details\".\"product_id\" FROM \"customers\" JOIN \"orders\" ON \"orders\".\"customer_id\" = \"customers\".\"customer_id\" JOIN \"order_details\" ON \"order_details\".\"order_id\" = \"orders\".\"order_id\" WHERE \"customers\".\"customer_id\" = 'LAZYK'), customers_of_all_products_of_lazyk AS (SELECT DISTINCT \"customers\".\"contact_name\" FROM \"customers\" JOIN \"orders\" ON \"orders\".\"customer_id\" = \"customers\".\"customer_id\" JOIN \"order_details\" ON \"order_details\".\"order_id\" = \"orders\".\"order_id\" WHERE \"order_details\".\"product_id\" IN (SELECT * FROM products_of_lazyk) AND \"customers\".\"customer_id\" <> 'LAZYK' GROUP BY \"customers\".\"contact_name\" HAVING COUNT(DISTINCT \"order_details\".\"product_id\") = (SELECT COUNT(*) FROM products_of_lazyk)) SELECT * FROM customers_of_all_products_of_lazyk"}
34 | {"question": "Select the average price of products by category.", "query": "SELECT category_id, AVG(unit_price) AS \"average_price\" FROM products GROUP BY category_id"}
35 | {"question": "Give the name of the categories and the average price of products in each category.", "query": "SELECT c.category_name, AVG(p.unit_price) AS \"average_price\" FROM categories AS c JOIN products AS p ON p.category_id = c.category_id GROUP BY c.category_name ORDER BY \"average_price\" NULLS FIRST"}
36 | {"question": "Select the identifier and the name of the companies that provide more than 3 products.", "query": "SELECT s.supplier_id, s.company_name FROM suppliers AS s JOIN products AS p ON p.supplier_id = s.supplier_id GROUP BY s.supplier_id, s.company_name HAVING COUNT(p.product_id) > 3"}
37 | {"question": "Select the identifier, name, and number of orders of employees, ordered by the employee identifier.", "query": "SELECT e.employee_id, e.last_name, e.first_name, COUNT(o.order_id) AS \"orders\" FROM employees AS e LEFT JOIN orders AS o ON o.employee_id = e.employee_id GROUP BY e.employee_id, e.last_name, e.first_name ORDER BY e.employee_id NULLS FIRST"}
38 | {"question": "For each employee give the identifier, name, and the number of distinct products sold, ordered by the employee identifier.", "query": "SELECT e.employee_id, e.last_name, e.first_name, COUNT(DISTINCT od.product_id) AS \"products_sold\" FROM employees AS e LEFT JOIN orders AS o ON o.employee_id = e.employee_id LEFT JOIN \"order_details\" AS od ON od.order_id = o.order_id GROUP BY e.employee_id, e.last_name, e.first_name ORDER BY e.employee_id NULLS FIRST"}
39 | {"question": "Select the identifier, name, and total sales of employees, ordered by the employee identifier.", "query": "SELECT e.employee_id, e.last_name, e.first_name, COUNT(od.quantity) AS \"total_sales\" FROM employees AS e LEFT JOIN orders AS o ON o.employee_id = e.employee_id LEFT JOIN \"order_details\" AS od ON od.order_id = o.order_id GROUP BY e.employee_id, e.last_name, e.first_name ORDER BY e.employee_id NULLS FIRST"}
40 | {"question": "Select the identifier, name, and total sales of employees, ordered by the employee identifier\n--for employees who have sold more than 70 different products.", "query": "SELECT e.employee_id, e.last_name, e.first_name, COUNT(od.quantity) AS \"total_sales\" FROM employees AS e LEFT JOIN orders AS o ON o.employee_id = e.employee_id LEFT JOIN \"order_details\" AS od ON od.order_id = o.order_id GROUP BY e.employee_id, e.last_name, e.first_name HAVING COUNT(DISTINCT od.product_id) > 70 ORDER BY e.employee_id NULLS FIRST"}
41 | {"question": "Select the names of employees who sell the products of more than 7 suppliers.", "query": "SELECT e.last_name, e.first_name FROM employees AS e LEFT JOIN orders AS o ON o.employee_id = e.employee_id LEFT JOIN \"order_details\" AS od ON od.order_id = o.order_id LEFT JOIN products AS p ON p.product_id = od.product_id GROUP BY e.last_name, e.first_name HAVING COUNT(DISTINCT p.supplier_id) > 7"}
42 |
--------------------------------------------------------------------------------