├── .github ├── CODEOWNERS └── workflows │ ├── project.yml │ ├── publish_cookbook.yml │ └── verify_index.yml ├── README.md ├── data ├── multilingual_rag_podcast │ └── podcast_transcript_whisper_small.txt ├── prometheus2_evaluation │ └── gemma_2b_rag_results.json └── rag-with-nims │ ├── ChipNeMo.pdf │ ├── embeddings-1.png │ └── embeddings-2.png ├── index.toml ├── notebooks ├── agent_with_human_in_the_loop.ipynb ├── amazon_bedrock_for_documentation_qa.ipynb ├── amazon_sagemaker_and_chroma_for_qa.ipynb ├── apify_haystack_instagram_comments_analysis.ipynb ├── apify_haystack_rag.ipynb ├── apify_haystack_rag_web_browser.ipynb ├── astradb_haystack_integration.ipynb ├── async_pipeline.ipynb ├── auto_merging_retriever.ipynb ├── charactercodex_llamafile.ipynb ├── chat_with_SQL_3_ways.ipynb ├── chroma-indexing-and-rag-examples.ipynb ├── cohere-v3-for-multilingual-qa.ipynb ├── conversational_rag_using_memory.ipynb ├── evaluating_ai_with_haystack.ipynb ├── extracting_metadata_filters_from_a_user_query.ipynb ├── feedback-analysis-agent-with-AzureAISearch.ipynb ├── function_calling_with_OpenAIChatGenerator.ipynb ├── gemma_chat_rag.ipynb ├── github_issue_resolver_agent.ipynb ├── hackernews-custom-component-rag.ipynb ├── haystack-1.x │ ├── cohere-for-multilingual-qa.ipynb │ ├── gpt4-weaviate-custom-documentation-qa.ipynb │ └── whisper-and-weaviate-for-youtube-rag.ipynb ├── hybrid_rag_pipeline_with_breakpoints.ipynb ├── hybrid_retrieval_bm42.ipynb ├── improve-retrieval-by-embedding-metadata.ipynb ├── information-extraction-gorilla.ipynb ├── information_extraction_raven.ipynb ├── jina-embeddings-v2-legal-analysis-rag.ipynb ├── keyword-extraction.ipynb ├── llama32_agentic_rag.ipynb ├── llama3_rag.ipynb ├── metadata_enrichment.ipynb ├── metadata_extraction_with_llm_metadata_extractor.ipynb ├── mixtral-8x7b-for-web-qa.ipynb ├── model_explorer_streaming.ipynb ├── multilingual_rag_podcast.ipynb ├── multimodal_agent_with_fastrag_haystack.ipynb ├── multimodal_intro.ipynb ├── newsletter-agent.ipynb ├── openapitool.ipynb ├── prometheus2_evaluation.ipynb ├── prompt_customization_for_Anthropic.ipynb ├── prompt_optimization_with_dspy.ipynb ├── query-expansion.ipynb ├── query_decomposition.ipynb ├── rag-with-nims.ipynb ├── rag_eval_deep_eval.ipynb ├── rag_eval_ragas.ipynb ├── rag_eval_uptrain.ipynb ├── rag_fastembed.ipynb ├── sparse_embedding_retrieval.ipynb ├── swarm.ipynb ├── techcrunch_news_digest_titanml_takeoff.ipynb ├── tools_support.ipynb ├── using_hyde_for_improved_retrieval.ipynb ├── using_speaker_diarization_with_assemblyai.ipynb ├── vertexai-gemini-examples.ipynb ├── vllm_inference_engine.ipynb ├── web_enhanced_self_reflecting_agent.ipynb └── zephyr-7b-beta-for-rag.ipynb ├── requirements.txt └── scripts ├── generate_markdowns.py └── verify_index.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/about-codeowners/ for syntax 2 | 3 | # DevRel Team will be the default owner for everything 4 | # in the repo. Unless a later match takes precedence, 5 | # @deepset-ai/devrel will be requested for review 6 | # when someone opens a pull request. 
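# (Illustrative only, kept as a comment so it has no effect: a later, more
# specific pattern would take precedence over the catch-all rule below, e.g.
# "/notebooks/ @deepset-ai/devrel @your-notebook-team" for a hypothetical team.)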
7 | * @deepset-ai/devrel 8 | -------------------------------------------------------------------------------- /.github/workflows/project.yml: -------------------------------------------------------------------------------- 1 | name: Track issues with GitHub project 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | 8 | jobs: 9 | add-to-project: 10 | name: Add new issues to project for triage 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/add-to-project@v1.0.2 14 | with: 15 | project-url: https://github.com/orgs/deepset-ai/projects/5 16 | github-token: ${{ secrets.GH_PROJECT_PAT }} 17 | -------------------------------------------------------------------------------- /.github/workflows/publish_cookbook.yml: -------------------------------------------------------------------------------- 1 | name: Publish cookbook on Haystack Home 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | paths: 9 | - "notebooks/*.ipynb" 10 | - "index.toml" 11 | 12 | jobs: 13 | publish-cookbook: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: trigger-hook 18 | run: | 19 | curl -X POST ${{ secrets.VERCEL_DEPLOY_HOOK }} 20 | -------------------------------------------------------------------------------- /.github/workflows/verify_index.yml: -------------------------------------------------------------------------------- 1 | name: Verify index 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - "notebooks/*.ipynb" 7 | - "index.toml" 8 | 9 | jobs: 10 | verify-index: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.12" 19 | 20 | - name: Verify index content 21 | run: python scripts/verify_index.py 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 👩🏻‍🍳 Haystack Cookbook 2 | 3 |
4 | Green logo of a stylized white 'H' with the text 'Haystack, by deepset. The Production-Ready Open Source AI Framework' Abstract green and yellow diagrams in the background. 5 |
6 | 7 | ## 🧑‍🍳🍳Discover The Haystack Cookbook [here](https://haystack.deepset.ai/cookbook) 8 | 9 | A collection of example notebooks using [Haystack](https://github.com/deepset-ai/haystack) 💚 10 | 11 | These examples can serve as guidelines for using different model providers, vector databases, retrieval techniques, new experimental features, and more with Haystack. Most showcase a specific, small demo. 12 | 13 | To learn more about _how_ to use Haystack, please visit our [Docs](https://docs.haystack.deepset.ai/docs) and official [Tutorials](https://haystack.deepset.ai/tutorials). 14 | 15 | For more examples, you may also find our [Blog](https://haystack.deepset.ai/blog) useful. 16 | 17 | ### Contributing to the Haystack Cookbook 18 | 19 | If you have an example that uses Haystack, you can add it to this repository by creating a PR. 20 | > You can also create a PR directly from Colab by creating a fork of this repository and selecting "Save a Copy to GitHub" on Colab. Once you add your example to your fork, you can create a PR on this repository. 21 | 22 | 1. Add your notebook to `/notebooks` folder 23 | 2. Give a descriptive name to your file that includes the names of (if applicable) the model providers, databases, the technologies you use in your example, and/or the task you are completing in the example. 24 | 3. Make sure you add it to `index.toml`, including its title and topics. If the notebook is using an experimental feature, also add the discussion link with the `experimental = true` field. 25 | -------------------------------------------------------------------------------- /data/prometheus2_evaluation/gemma_2b_rag_results.json: -------------------------------------------------------------------------------- 1 | { 2 | "questions": [ 3 | "Is cDK1 and CDK2 activity a strong predictor of renal cell carcinoma recurrence?", 4 | "Does metabolic control analysis of the Trypanosoma cruzi peroxide detoxification pathway identify tryparedoxin as a suitable drug target?", 5 | "Does promoter variant rs2301228 on the neural cell adhesion molecule 1 gene confer risk of schizophrenia in Han Chinese?", 6 | "Does pancreatic polypeptide regulate glucagon release through PPYR1 receptors expressed in mouse and human alpha-cells?", 7 | "Does tetraploid complementation prove pluripotency of induced pluripotent stem cells derived from adipose tissue?", 8 | "Is osteoprotegerin associated with subclinical left ventricular systolic dysfunction in diabetic hypertensive patients : a speckle tracking study?", 9 | "Is cD30 expression a novel prognostic indicator in extranodal natural killer/T-cell lymphoma , nasal type?", 10 | "Does mild cognitive dysfunction affect diabetes mellitus control in minority elderly adults?", 11 | "Do youth walking and biking rates vary by environments around 5 Louisiana schools?", 12 | "Are human enteroviruses the cause of neurological impairments in children at the Korle-Bu Teaching Hospital?" 
13 | ], 14 | "ground_truth_answers": [ 15 | "CDK1SA of tumors and the CDK2SA are both associated with recurrence and prognosis.", 16 | "These quantitative kinetic and metabolic analyses pointed out to TXN as a convenient drug target due to its low catalytic efficiency, high control on the flux of peroxide detoxification and role as provider of reducing equivalents to the two main peroxidases in the parasite.", 17 | "Our results provide direct evidence for NCAM1 as a susceptibility gene for schizophrenia, which offers support to a neurodevelopmental model and neuronal connectivity hypothesis in the onset of schizophrenia.", 18 | "Glucose stimulates PP secretion and PP inhibits glucagon release in mouse pancreatic islets. PP receptors are present in alpha-cells of mouse and human pancreatic islets.", 19 | "We also directed differentiation of iPS cells into chondrocytes, thus adipose-derived iPS cells can be used as models to study chondrogenic differentiation and cartilage regeneration.", 20 | "Plasma OPG values could predict subclinical LV systolic dysfunction in diabetic hypertensive patients.", 21 | "Our results showed that expression of CD30 was not related to response to treatment but was an independent prognostic factor for both OS and PFS in ENKTL, nasal type, which suggests a role for CD30 in the pathogenesis of this disease and may support the incorporation of anti-CD30-targeted therapy into the treatment paradigm for ENKTL.", 22 | "Cognitive dysfunction in the mild range did not seem to affect diabetes mellitus control parameters in this multiethnic cohort of older adults with diabetes mellitus, although it cannot be excluded that cognitive impairment was overcome through assistance from formal or informal caregivers. It is possible that more-severe cognitive dysfunction could affect control.", 23 | "There is a role and a need for the SRTS program. The environment and other factors influence biking and walking to school.", 24 | "Non-polio enteroviruses and chikugunya virus were found not to be the etiologic agent responsible for the convulsion with neurologic morbidity observed in the Ghanaian children. Investigation for other viral agents is recommended." 
25 | ], 26 | "rag_answers": [ 27 | " **Yes, cDK1 and CDK2 activity is a strong predictor of renal cell carcinoma recurrence.**\n\nThe study found that tumors with low CDK2SA-CDK1SA ratio showed significantly better 5-year recurrence-free survival than those with high CDK2SA-CDK1SA ratio.", 28 | " **Yes.**\n\nMetabolic control analysis of the Trypanosoma cruzi peroxide detoxification pathway identified tryparedoxin as a suitable drug target, with 90-100% of the pathway flux controlled by TXN-TXNPx and TXN-GPxA redox pairs.", 29 | " **Yes.**\n\nThe study found that the promoter variant rs2301228 on the NCAM1 gene is associated with schizophrenia in Han Chinese.", 30 | " **Yes.**\n\nThe context information indicates that PPYR1 receptors are expressed in the alpha-cell population of mouse and human islets, and that PP release increases upon glucose stimulation, leading to inhibition of glucagon release.", 31 | " **Yes.**\n\nThe study shows that tetraploid complementation successfully generated viable and fertile adult mice derived entirely from reprogrammed ASC, indicating that cell types other than fibroblasts can also be restored to the embryonic level of pluripotency.", 32 | " **Yes**, osteoprotegerin (OPG) is associated with subclinical left ventricular systolic dysfunction in diabetic hypertensive patients.\n\n**Results:**\n\n* Diabetic hypertensive patients had higher diastolic peak early/early diastolic tissue velocity and lower systolic tissue velocity, GLS, GLS rate systolic, and GLS rate early diastolic than nondiabetic hypertensive patients.\n* OPG was found to be an independent predictor of impaired GLS.\n* Receiver operating characteristic curve analysis revealed that OPG values of > 6.45 (pmol/L) identified the patients with GLS \u2264 18.5.", 33 | "**Yes**, CD30 expression is a novel prognostic indicator in extranodal natural killer/T-cell lymphoma, nasal type.\n\nThe study found that CD30 positivity was associated with shorter 5-year OS and PFS in patients with extranodal natural killer/T-cell lymphoma, nasal type.", 34 | " The provided text does not contain any information regarding whether mild cognitive dysfunction affects diabetes mellitus control in minority elderly adults, so I am unable to answer this question from the provided context.", 35 | " The provided text indicates that there is a significant difference in the walking and biking rates between the 5 Louisiana schools. More students walked than biked to school, and the sites with the highest PEDS score had the highest percentage of students who walked/biked to school.", 36 | "The provided text suggests that enteroviruses were detected in cerebrospinal fluid (CSF) samples from children at the Korle-Bu Teaching Hospital, but further studies are needed to establish a causal relationship between enteroviruses and neurological impairments." 
37 | ] 38 | } -------------------------------------------------------------------------------- /data/rag-with-nims/ChipNeMo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/haystack-cookbook/e40d163868a84b9a09b486e0cab47b9fb90e9a71/data/rag-with-nims/ChipNeMo.pdf -------------------------------------------------------------------------------- /data/rag-with-nims/embeddings-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/haystack-cookbook/e40d163868a84b9a09b486e0cab47b9fb90e9a71/data/rag-with-nims/embeddings-1.png -------------------------------------------------------------------------------- /data/rag-with-nims/embeddings-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/haystack-cookbook/e40d163868a84b9a09b486e0cab47b9fb90e9a71/data/rag-with-nims/embeddings-2.png -------------------------------------------------------------------------------- /index.toml: -------------------------------------------------------------------------------- 1 | [config] 2 | layout = "cookbook" 3 | toc = true 4 | colab = "https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/" 5 | 6 | [[cookbook]] 7 | title = "PDF-Based Question Answering with Amazon Bedrock and Haystack" 8 | notebook = "amazon_bedrock_for_documentation_qa.ipynb" 9 | topics = ["RAG"] 10 | 11 | [[cookbook]] 12 | title = "Question Answering with Amazon Sagemaker, Chroma and Haystack" 13 | notebook = "amazon_sagemaker_and_chroma_for_qa.ipynb" 14 | topics = ["RAG"] 15 | 16 | [[cookbook]] 17 | title = "RAG: Extract and use website content for question answering with Apify-Haystack integration" 18 | notebook = "apify_haystack_rag.ipynb" 19 | topics = ["RAG", "Web-QA"] 20 | 21 | [[cookbook]] 22 | title = "AstraDB 🤝 Haystack Integration" 23 | notebook = "astradb_haystack_integration.ipynb" 24 | topics = ["RAG"] 25 | 26 | [[cookbook]] 27 | title = "Improving Retrieval with Auto-Merging and Hierarchical Document Retrieval" 28 | notebook = "auto_merging_retriever.ipynb" 29 | topics = ["Advanced Retrieval", "RAG"] 30 | 31 | [[cookbook]] 32 | title = "🧩 Quizzes and Adventures 🏰 with Character Codex and llamafile" 33 | notebook = "charactercodex_llamafile.ipynb" 34 | topics = ["Model Serving", "Chat"] 35 | 36 | [[cookbook]] 37 | title = "Chat With Your SQL Database" 38 | notebook = "chat_with_SQL_3_ways.ipynb" 39 | topics = ["Chat", "SQL", "Agents"] 40 | 41 | [[cookbook]] 42 | title = "Chroma Indexing and RAG Examples" 43 | notebook = "chroma-indexing-and-rag-examples.ipynb" 44 | topics = ["RAG", "Vector Databases"] 45 | 46 | [[cookbook]] 47 | title = "Cohere v3 for Multilingual QA" 48 | notebook = "cohere-v3-for-multilingual-qa.ipynb" 49 | topics = ["Multilingual RAG"] 50 | 51 | [[cookbook]] 52 | title = "Conversational RAG using Memory" 53 | notebook = "conversational_rag_using_memory.ipynb" 54 | topics = ["Chat", "RAG", "Agents"] 55 | experimental = true 56 | discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/75" 57 | 58 | [[cookbook]] 59 | title = "Extract Metadata Filters from a Query" 60 | notebook = "extracting_metadata_filters_from_a_user_query.ipynb" 61 | topics = ["Advanced Retrieval", "Metadata"] 62 | 63 | [[cookbook]] 64 | title = "Function Calling with OpenAIChatGenerator" 65 | notebook = "function_calling_with_OpenAIChatGenerator.ipynb" 66 
| topics = ["Function Calling", "Chat", "Agents"] 67 | outdated = true 68 | 69 | [[cookbook]] 70 | title = "Build with Gemma and Haystack" 71 | notebook = "gemma_chat_rag.ipynb" 72 | topics = ["RAG"] 73 | 74 | [[cookbook]] 75 | title = "Hacker News Summaries with Custom Components" 76 | notebook = "hackernews-custom-component-rag.ipynb" 77 | topics = ["RAG", "Customization"] 78 | 79 | [[cookbook]] 80 | title = "Hybrid Retrieval with BM42" 81 | notebook = "hybrid_retrieval_bm42.ipynb" 82 | topics = ["Advanced Retrieval"] 83 | 84 | [[cookbook]] 85 | title = "Improve Retrieval by Embedding Meaningful Metadata" 86 | notebook = "improve-retrieval-by-embedding-metadata.ipynb" 87 | topics = ["Advanced Retrieval", "Metadata"] 88 | 89 | [[cookbook]] 90 | title = "🐦‍⬛ Information Extraction with Raven" 91 | notebook = "information_extraction_raven.ipynb" 92 | topics = ["Function Calling"] 93 | 94 | [[cookbook]] 95 | title = "🦍 Information Extraction with Gorilla" 96 | notebook = "information-extraction-gorilla.ipynb" 97 | topics = ["Function Calling"] 98 | 99 | [[cookbook]] 100 | title = "Legal Document Analysis with jina-embeddings-v2-base-en" 101 | notebook = "jina-embeddings-v2-legal-analysis-rag.ipynb" 102 | topics = ["Summarization", "Advanced Retrieval"] 103 | 104 | [[cookbook]] 105 | title = "RAG with Llama 3.1" 106 | notebook = "llama3_rag.ipynb" 107 | topics = ["RAG"] 108 | 109 | [[cookbook]] 110 | title = "Web QA with Mixtral-8x7B-Instruct-v0.1" 111 | notebook = "mixtral-8x7b-for-web-qa.ipynb" 112 | topics = ["Web-QA"] 113 | 114 | [[cookbook]] 115 | title = "Streaming Model Explorer" 116 | notebook = "model_explorer_streaming.ipynb" 117 | topics = ["Discovery"] 118 | 119 | [[cookbook]] 120 | title = "Multilingual RAG on a Podcast" 121 | notebook = "multilingual_rag_podcast.ipynb" 122 | topics = ["Audio", "Multilingual RAG"] 123 | 124 | [[cookbook]] 125 | title = "Invoking APIs with OpenAPITool" 126 | notebook = "openapitool.ipynb" 127 | topics = ["Function Calling"] 128 | experimental = true 129 | outdated = true 130 | discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/79" 131 | 132 | [[cookbook]] 133 | title = "RAG Evaluation with Prometheus 2" 134 | notebook = "prometheus2_evaluation.ipynb" 135 | topics = ["Evaluation"] 136 | 137 | [[cookbook]] 138 | title = "Advanced Prompt Customization for Anthropic" 139 | notebook = "prompt_customization_for_Anthropic.ipynb" 140 | topics = ["Customization", "Prompting"] 141 | 142 | [[cookbook]] 143 | title = "Prompt Optimization with DSPy" 144 | notebook = "prompt_optimization_with_dspy.ipynb" 145 | topics = ["Customization", "Prompting"] 146 | 147 | [[cookbook]] 148 | title = "Advanced RAG: Query Expansion" 149 | notebook = "query-expansion.ipynb" 150 | topics = ["Advanced Retrieval", "RAG"] 151 | 152 | [[cookbook]] 153 | title = "RAG Pipeline Evaluation Using DeepEval" 154 | notebook = "rag_eval_deep_eval.ipynb" 155 | topics = ["Evaluation"] 156 | 157 | [[cookbook]] 158 | title = "RAG Pipeline Evaluation Using RAGAS" 159 | notebook = "rag_eval_ragas.ipynb" 160 | topics = ["Evaluation"] 161 | 162 | [[cookbook]] 163 | title = "RAG Pipeline Evaluation Using UpTrain" 164 | notebook = "rag_eval_uptrain.ipynb" 165 | topics = ["Evaluation"] 166 | 167 | [[cookbook]] 168 | title = "RAG Pipeline Using FastEmbed for Embeddings Generationn" 169 | notebook = "rag_fastembed.ipynb" 170 | topics = ["RAG", "Advanced Retrieval"] 171 | 172 | [[cookbook]] 173 | title = "Haystack RAG Pipeline with Self-Deployed AI models using NVIDIA NIMs" 174 | 
notebook = "rag-with-nims.ipynb" 175 | topics = ["RAG", "Model Serving"] 176 | 177 | [[cookbook]] 178 | title = "Sparse Embedding Retrieval with Qdrant and FastEmbed" 179 | notebook = "sparse_embedding_retrieval.ipynb" 180 | topics = ["RAG", "Advanced Retrieval"] 181 | 182 | [[cookbook]] 183 | title = "Getting a Daily Digest From Tech Websites" 184 | notebook = "techcrunch_news_digest_titanml_takeoff.ipynb" 185 | topics = ["Model Serving", "RAG"] 186 | 187 | [[cookbook]] 188 | title = "Using Hypothetical Document Embeddings (HyDE) to Improve Retrieval" 189 | notebook = "using_hyde_for_improved_retrieval.ipynb" 190 | topics = ["Advanced Retrieval", "RAG"] 191 | 192 | [[cookbook]] 193 | title = "Speaker Diarization with AssemblyAI" 194 | notebook = "using_speaker_diarization_with_assemblyai.ipynb" 195 | topics = ["Audio"] 196 | 197 | [[cookbook]] 198 | title = "Function Calling and Multimodal QA with Gemini" 199 | notebook = "vertexai-gemini-examples.ipynb" 200 | topics = ["Function Calling", "Multimodal"] 201 | 202 | [[cookbook]] 203 | title = "Use the ⚡ vLLM inference engine with Haystack" 204 | notebook = "vllm_inference_engine.ipynb" 205 | topics = ["Model Serving"] 206 | 207 | [[cookbook]] 208 | title = "🪁 RAG pipelines with Haystack + Zephyr 7B Beta" 209 | notebook = "zephyr-7b-beta-for-rag.ipynb" 210 | topics = ["RAG"] 211 | 212 | [[cookbook]] 213 | title = "Advanced RAG: Query Decomposition and Reasoning" 214 | notebook = "query_decomposition.ipynb" 215 | topics = ["Advanced Retrieval", "RAG", "Agents"] 216 | 217 | [[cookbook]] 218 | title = "Analyze Your Instagram Comments’ Vibe with Apify and Haystack" 219 | notebook = "apify_haystack_instagram_comments_analysis.ipynb" 220 | topics = ["Prompting", "Data Scraping"] 221 | 222 | [[cookbook]] 223 | title = "Define & Run Tools " 224 | notebook = "tools_support.ipynb" 225 | topics = ["Function Calling", "Chat", "Agents"] 226 | 227 | [[cookbook]] 228 | title = "Agentic RAG with Llama 3.2 3B" 229 | notebook = "llama32_agentic_rag.ipynb" 230 | topics = ["RAG", "Agents", "Web-QA"] 231 | 232 | [[cookbook]] 233 | title = "Advanced RAG: Automated Structured Metadata Enrichment" 234 | notebook = "metadata_enrichment.ipynb" 235 | topics = ["Advanced Retrieval", "RAG", "Metadata"] 236 | 237 | [[cookbook]] 238 | title = "Newsletter Sending Agent with Tools" 239 | notebook = "newsletter-agent.ipynb" 240 | topics = ["Function Calling", "Chat", "Agents"] 241 | 242 | [[cookbook]] 243 | title = "Evaluating AI with Haystack" 244 | notebook = "evaluating_ai_with_haystack.ipynb" 245 | experimental = true 246 | topics = ["Evaluation"] 247 | discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/74" 248 | 249 | [[cookbook]] 250 | title = "Multimodal Agent with fastRAG and Haystack" 251 | notebook = "multimodal_agent_with_fastrag_haystack.ipynb" 252 | topics = ["Multimodal", "Agents"] 253 | 254 | [[cookbook]] 255 | title = "Web-Enhanced Self-Reflecting Agent" 256 | notebook = "web_enhanced_self_reflecting_agent.ipynb" 257 | topics = ["Agents"] 258 | 259 | [[cookbook]] 260 | title = "Create a Swarm of Agents" 261 | notebook = "swarm.ipynb" 262 | topics = ["Function Calling", "Chat", "Agents"] 263 | 264 | [[cookbook]] 265 | title = "Running Haystack Pipelines in Asynchronous Environments" 266 | notebook = "async_pipeline.ipynb" 267 | topics = ["Async"] 268 | 269 | [[cookbook]] 270 | title = "RAG: Web Search and Analysis with Apify and Haystack" 271 | notebook = "apify_haystack_rag_web_browser.ipynb" 272 | topics = ["Web-QA", "Data Scraping", 
"RAG"] 273 | 274 | [[cookbook]] 275 | title = "Build a GitHub Issue Resolver Agent" 276 | notebook = "github_issue_resolver_agent.ipynb" 277 | new = true 278 | topics = ["Function Calling", "Agents"] 279 | 280 | [[cookbook]] 281 | title = "Extracting Metadata with an LLM" 282 | notebook = "metadata_extraction_with_llm_metadata_extractor.ipynb" 283 | new = true 284 | topics = ["Metadata"] 285 | 286 | [[cookbook]] 287 | title = "Building an Interactive Feedback Review Agent with Azure AI Search and Haystack" 288 | notebook = "feedback-analysis-agent-with-AzureAISearch.ipynb" 289 | new = true 290 | topics = ["AzureAISearch", "Agents", "RAG"] 291 | 292 | [[cookbook]] 293 | title = "Hybrid RAG Pipeline with Breakpoints" 294 | notebook = "hybrid_rag_pipeline_with_breakpoints.ipynb" 295 | experimental = true 296 | new = true 297 | topics = ["Advanced Retrieval", "RAG"] 298 | discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/281" 299 | 300 | [[cookbook]] 301 | title = "Simple Keyword Extraction using OpenAIChatGenerator" 302 | notebook = "keyword-extraction.ipynb" 303 | new = true 304 | topics = ["Keyword Extraction"] 305 | 306 | [[cookbook]] 307 | title = "DevOps Support Agent with Human in the Loop" 308 | notebook = "agent_with_human_in_the_loop.ipynb" 309 | new = true 310 | topics = ["Function Calling", "Agents"] 311 | 312 | [[cookbook]] 313 | title = "Introduction to Multimodal Text Generation" 314 | notebook = "multimodal_intro.ipynb" 315 | new = true 316 | experimental = true 317 | topics = ["Multimodal"] 318 | discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/302" 319 | -------------------------------------------------------------------------------- /notebooks/amazon_bedrock_for_documentation_qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "YxZBCJn21Ygd" 7 | }, 8 | "source": [ 9 | "# PDF-Based Question Answering with Amazon Bedrock and Haystack\n", 10 | "\n", 11 | "*Notebook by [Bilge Yucel](https://www.linkedin.com/in/bilge-yucel/)*\n", 12 | "\n", 13 | "[Amazon Bedrock](https://aws.amazon.com/bedrock/) is a fully managed service that provides high-performing foundation models from leading AI startups and Amazon through a single API. You can choose from various foundation models to find the one best suited for your use case.\n", 14 | "\n", 15 | "In this notebook, we'll go through the process of **creating a generative question answering application** tailored for PDF files using the newly added [Amazon Bedrock integration](https://haystack.deepset.ai/integrations/amazon-bedrock) with [Haystack](https://github.com/deepset-ai/haystack) and [OpenSearch](https://haystack.deepset.ai/integrations/opensearch-document-store) to store our documents efficiently. 
The demo will illustrate the step-by-step development of a QA application designed specifically for the Bedrock documentation, demonstrating the power of Bedrock in the process 🚀\n", 16 | "\n", 17 | "## Setup the Development Environment" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "v5dzhxUV1QwR" 24 | }, 25 | "source": [ 26 | "### Install dependencies" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "EX5oCws-etEH" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "%%bash\n", 38 | "\n", 39 | "pip install opensearch-haystack amazon-bedrock-haystack pypdf" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "WMJaEllC1Wat" 46 | }, 47 | "source": [ 48 | "### Download Files\n", 49 | "\n", 50 | "For this application, we'll use the user guide of Amazon Bedrock. Amazon Bedrock provides the [PDF form of its guide](https://docs.aws.amazon.com/pdfs/bedrock/latest/userguide/bedrock-ug.pdf). Run the code to download the PDF to `/content/bedrock-documentation.pdf` directory 👇🏼 " 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "id": "chi-VAhGeuQn" 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import os\n", 62 | "\n", 63 | "import boto3\n", 64 | "from botocore import UNSIGNED\n", 65 | "from botocore.config import Config\n", 66 | "\n", 67 | "s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))\n", 68 | "s3.download_file('core-engineering', 'public/blog-posts/bedrock-documentation.pdf', '/content/bedrock-documentation.pdf')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "Ys3-RvVqqWdD" 75 | }, 76 | "source": [ 77 | "### Initialize an OpenSearch Instance on Colab\n", 78 | "\n", 79 | "[OpenSearch](https://opensearch.org/) is a fully open source search and analytics engine and is compatible with the [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/what-is.html) that’s helpful if you’d like to deploy, operate, and scale your OpenSearch cluster later on.\n", 80 | "\n", 81 | "Let’s install OpenSearch and start an instance on Colab. For other installation options, check out [OpenSearch documentation](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/index/)." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "id": "vyWWR3Xye8l_" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "!wget https://artifacts.opensearch.org/releases/bundle/opensearch/2.11.1/opensearch-2.11.1-linux-x64.tar.gz\n", 93 | "!tar -xvf opensearch-2.11.1-linux-x64.tar.gz\n", 94 | "!chown -R daemon:daemon opensearch-2.11.1\n", 95 | "# disabling security. 
Be mindful when you want to disable security in production systems\n", 96 | "!sudo echo 'plugins.security.disabled: true' >> opensearch-2.11.1/config/opensearch.yml" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "id": "Vaxe75MXkMi2" 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "%%bash --bg\n", 108 | "cd opensearch-2.11.1 && sudo -u daemon -- ./bin/opensearch" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "id": "YuN1y5WQ1jI9" 115 | }, 116 | "source": [ 117 | "> OpenSearch needs 30 seconds for a fully started server" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "id": "f9gbVwRU_Y5Q" 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "import time\n", 129 | "\n", 130 | "time.sleep(30)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "pSBYYgYq1Ij3" 137 | }, 138 | "source": [ 139 | "### API Keys\n", 140 | "\n", 141 | "To use Amazon Bedrock, you need `aws_access_key_id`, `aws_secret_access_key`, and indicate the `aws_region_name`. Once logged into your account, locate these keys under the IAM user's \"Security Credentials\" section. For detailed guidance, refer to the documentation on [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "id": "tZTz7cHwhZ-9" 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "from getpass import getpass\n", 153 | "\n", 154 | "os.environ[\"AWS_ACCESS_KEY_ID\"] = getpass(\"aws_access_key_id: \")\n", 155 | "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = getpass(\"aws_secret_access_key: \")\n", 156 | "os.environ[\"AWS_DEFAULT_REGION\"] = input(\"aws_region_name: \")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "id": "oa6aH6fB08d_" 163 | }, 164 | "source": [ 165 | "## Building the Indexing Pipeline\n", 166 | "\n", 167 | "Our indexing pipeline will convert the PDF file into a Haystack Document using [PyPDFToDocument](https://docs.haystack.deepset.ai/v2.0/docs/pypdftodocument) and preprocess it by cleaning and splitting it into chunks before storing them in [OpenSearchDocumentStore](https://docs.haystack.deepset.ai/v2.0/docs/opensearch-document-store).\n", 168 | "\n", 169 | "Let’s run the pipeline below and index our file to our document store:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "id": "SrBctAl5e_Kf" 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "from pathlib import Path\n", 181 | "\n", 182 | "from haystack import Pipeline\n", 183 | "from haystack.components.converters import PyPDFToDocument\n", 184 | "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", 185 | "from haystack.components.writers import DocumentWriter\n", 186 | "from haystack.document_stores.types import DuplicatePolicy\n", 187 | "from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore\n", 188 | "\n", 189 | "## Initialize the OpenSearchDocumentStore\n", 190 | "document_store = OpenSearchDocumentStore()\n", 191 | "\n", 192 | "## Create pipeline components\n", 193 | "converter = PyPDFToDocument()\n", 194 | "cleaner = DocumentCleaner()\n", 195 | "splitter = DocumentSplitter(split_by=\"sentence\", split_length=10, split_overlap=2)\n", 196 | "writer = 
DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n", 197 | "\n", 198 | "## Add components to the pipeline\n", 199 | "indexing_pipeline = Pipeline()\n", 200 | "indexing_pipeline.add_component(\"converter\", converter)\n", 201 | "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", 202 | "indexing_pipeline.add_component(\"splitter\", splitter)\n", 203 | "indexing_pipeline.add_component(\"writer\", writer)\n", 204 | "\n", 205 | "## Connect the components to each other\n", 206 | "indexing_pipeline.connect(\"converter\", \"cleaner\")\n", 207 | "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", 208 | "indexing_pipeline.connect(\"splitter\", \"writer\")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "id": "oJLXM8nM02AB" 215 | }, 216 | "source": [ 217 | "Run the pipeline with the files you want to index:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "id": "X7HrON1PFHos" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "indexing_pipeline.run({\"converter\": {\"sources\": [Path(\"/content/bedrock-documentation.pdf\")]}})" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "id": "UNmHvZLjA4Rv" 235 | }, 236 | "source": [ 237 | "## Building the Query Pipeline\n", 238 | "\n", 239 | "Let’s create another pipeline to query our application. In this pipeline, we’ll use [OpenSearchBM25Retriever](https://docs.haystack.deepset.ai/v2.0/docs/opensearchbm25retriever) to retrieve relevant information from the OpenSearchDocumentStore and an Amazon Titan model `amazon.titan-text-express-v1` to generate answers with [AmazonBedrockGenerator](https://docs.haystack.deepset.ai/v2.0/docs/amazonbedrockgenerator). You can select and test different models using the dropdown on right.\n", 240 | "\n", 241 | "Next, we'll create a prompt for our task using the Retrieval-Augmented Generation (RAG) approach with [PromptBuilder](https://docs.haystack.deepset.ai/v2.0/docs/promptbuilder). This prompt will help generate answers by considering the provided context. Finally, we'll connect these three components to complete the pipeline." 
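"\n",
"If you are not running this notebook in Colab, the dropdown is just a comment in the code cell: switching models is a matter of assigning a different Bedrock model id to `bedrock_model` before building the pipeline. A minimal sketch, assuming the chosen model is enabled in your AWS account:\n",
"\n",
"```python\n",
"bedrock_model = \"anthropic.claude-v2\"  # any id from the dropdown list can be used here\n",
"generator = AmazonBedrockGenerator(model=bedrock_model, max_length=500)\n",
"```\n"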
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "id": "8Q3JYuyShRnQ" 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "from haystack.components.builders import PromptBuilder\n", 253 | "from haystack import Pipeline\n", 254 | "from haystack_integrations.components.generators.amazon_bedrock import AmazonBedrockGenerator\n", 255 | "from haystack_integrations.components.retrievers.opensearch import OpenSearchBM25Retriever\n", 256 | "\n", 257 | "## Create pipeline components\n", 258 | "retriever = OpenSearchBM25Retriever(document_store=document_store, top_k=15)\n", 259 | "\n", 260 | "## Initialize the AmazonBedrockGenerator with an Amazon Bedrock model\n", 261 | "bedrock_model = 'amazon.titan-text-express-v1' # @param [\"amazon.titan-text-express-v1\", \"amazon.titan-text-lite-v1\", \"anthropic.claude-instant-v1\", \"anthropic.claude-v1\", \"anthropic.claude-v2\",\"anthropic.claude-v2:1\", \"meta.llama2-13b-chat-v1\", \"meta.llama2-70b-chat-v1\", \"ai21.j2-mid-v1\", \"ai21.j2-ultra-v1\"]\n", 262 | "generator = AmazonBedrockGenerator(model=bedrock_model, max_length=500)\n", 263 | "template = \"\"\"\n", 264 | "{% for document in documents %}\n", 265 | " {{ document.content }}\n", 266 | "{% endfor %}\n", 267 | "\n", 268 | "Please answer the question based on the given information from Amazon Bedrock documentation.\n", 269 | "\n", 270 | "{{question}}\n", 271 | "\"\"\"\n", 272 | "prompt_builder = PromptBuilder(template=template)\n", 273 | "\n", 274 | "## Add components to the pipeline\n", 275 | "rag_pipeline = Pipeline()\n", 276 | "rag_pipeline.add_component(\"retriever\", retriever)\n", 277 | "rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n", 278 | "rag_pipeline.add_component(\"llm\", generator)\n", 279 | "\n", 280 | "## Connect the components to each other\n", 281 | "rag_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", 282 | "rag_pipeline.connect(\"prompt_builder\", \"llm\")" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "id": "5NywqZKo6msf" 289 | }, 290 | "source": [ 291 | "Ask your question and learn about the Amazon Bedrock service using Amazon Bedrock models!" 
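"\n",
"Note that each query below passes the same question to two components: the retriever uses it as the search query, while the prompt builder fills the `{{question}}` variable in the template. The run call therefore provides both inputs, roughly:\n",
"\n",
"```python\n",
"response = rag_pipeline.run({\n",
"    \"retriever\": {\"query\": question},          # drives BM25 retrieval\n",
"    \"prompt_builder\": {\"question\": question},  # inserted into the prompt\n",
"})\n",
"```\n"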
292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "colab": { 299 | "base_uri": "https://localhost:8080/" 300 | }, 301 | "id": "mDYCSRRtiAy5", 302 | "outputId": "671da188-ed15-48e7-958f-0e1343faf458" 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "question = \"What is Amazon Bedrock?\"\n", 307 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 308 | "\n", 309 | "print(response[\"llm\"][\"replies\"][0])" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "colab": { 317 | "base_uri": "https://localhost:8080/" 318 | }, 319 | "id": "giSWajzyAcNp", 320 | "outputId": "0ec1dcc6-4a28-4815-f9b4-7c073584c20e" 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "question = \"How can I setup Amazon Bedrock?\"\n", 325 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 326 | "\n", 327 | "print(response[\"llm\"][\"replies\"][0])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "colab": { 335 | "base_uri": "https://localhost:8080/" 336 | }, 337 | "id": "ROhJ8VL_JdHc", 338 | "outputId": "76f73ed3-4fb3-4def-b88e-bbfde218cb7e" 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "question = \"How can I finetune foundation models?\"\n", 343 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 344 | "\n", 345 | "print(response[\"llm\"][\"replies\"][0])" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "colab": { 353 | "base_uri": "https://localhost:8080/" 354 | }, 355 | "id": "9ohsesYDgsSq", 356 | "outputId": "2ab326d4-4279-424a-a826-10dfcbe3c00d" 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "question = \"How should I form my prompts for Amazon Titan models?\"\n", 361 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 362 | "\n", 363 | "print(response[\"llm\"][\"replies\"][0])" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "colab": { 371 | "base_uri": "https://localhost:8080/" 372 | }, 373 | "id": "__NGWZdqh_dJ", 374 | "outputId": "26c84bf3-9401-438a-bac5-1032a5d9c20e" 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "question = \"How should I form my prompts for Claude models?\"\n", 379 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 380 | "\n", 381 | "print(response[\"llm\"][\"replies\"][0])" 382 | ] 383 | } 384 | ], 385 | "metadata": { 386 | "colab": { 387 | "provenance": [], 388 | "toc_visible": true 389 | }, 390 | "kernelspec": { 391 | "display_name": "Python 3", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.9.7" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 0 409 | } 410 | -------------------------------------------------------------------------------- /notebooks/amazon_sagemaker_and_chroma_for_qa.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "YxZBCJn21Ygd" 7 | }, 8 | "source": [ 9 | "# Question Answering with Amazon Sagemaker, Chroma and Haystack\n", 10 | "\n", 11 | "*Notebook by [Sara Zanzottera](https://www.zansara.dev/) and [Bilge Yucel](https://www.linkedin.com/in/bilge-yucel/)*\n", 12 | "\n", 13 | "[Amazon Sagemaker](https://docs.aws.amazon.com/sagemaker/) is a comprehensive, fully managed machine learning service\n", 14 | "that allows data scientists and developers to build, train, and deploy ML models efficiently. You can choose from various foundation models to find the one best suited for your use case.\n", 15 | "\n", 16 | "In this notebook, we'll go through the process of **creating a generative question answering application** using the newly added [Amazon Sagemaker integration](https://haystack.deepset.ai/integrations/amazon-sagemaker) with [Haystack](https://github.com/deepset-ai/haystack) and [Chroma](https://haystack.deepset.ai/integrations/chroma-documentstore) to store our documents efficiently. The demo will illustrate the step-by-step development of a QA application using some Wikipedia pages about NASA's Mars missions 🚀\n", 17 | "\n", 18 | "## Setup the Development Environment\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "v5dzhxUV1QwR" 25 | }, 26 | "source": [ 27 | "### Install dependencies" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "colab": { 35 | "base_uri": "https://localhost:8080/" 36 | }, 37 | "id": "EX5oCws-etEH", 38 | "is_executing": true, 39 | "outputId": "4d46055f-4d58-4d67-b895-ad701c2eb306" 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "%%bash\n", 44 | "\n", 45 | "pip install chroma-haystack amazon-sagemaker-haystack wikipedia typing_extensions" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "Eg9lSuAJM6MJ" 52 | }, 53 | "source": [ 54 | "## Deploy a model on Sagemaker\n", 55 | "\n", 56 | "To use Amazon Sagemaker's models, you first need to deploy them. In this example we'll be using Falcon 7B Instruct BF16, so make sure to deploy such model on your account before proceeding.\n", 57 | "\n", 58 | "For help you can check out:\n", 59 | "- Amazon Sagemaker Jumpstart [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/jumpstart-foundation-models-use.html).\n", 60 | "- [This notebook](https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/text-generation-falcon.ipynb) on how to deploy Falcon models programmatically with a notebook\n", 61 | "- [This blogpost](https://aws.amazon.com/blogs/machine-learning/build-production-ready-generative-ai-applications-for-enterprise-search-using-haystack-pipelines-and-amazon-sagemaker-jumpstart-with-llms/) about deploying models on Sagemaker for Haystack 1.x\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "id": "pSBYYgYq1Ij3" 68 | }, 69 | "source": [ 70 | "### API Keys\n", 71 | "\n", 72 | "To use Amazon Sagemaker, you need to set a few environment variables: `AWS ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and often to indicate the region by setting `AWS_REGION`. Once logged into your account, locate these keys under the IAM user's \"Security Credentials\" section. 
For detailed guidance, refer to the documentation on [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "tZTz7cHwhZ-9", 83 | "is_executing": true, 84 | "outputId": "72a5b7af-5d81-4c2f-e922-f35ee1dda94e" 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "import os\n", 89 | "from getpass import getpass\n", 90 | "\n", 91 | "os.environ[\"AWS_ACCESS_KEY_ID\"] = getpass(\"aws_access_key_id: \")\n", 92 | "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = getpass(\"aws_secret_access_key: \")\n", 93 | "os.environ[\"AWS_REGION\"] = input(\"aws_region_name: \")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "k-CF7LUSy2T7" 100 | }, 101 | "source": [ 102 | "## Load data from Wikipedia\n", 103 | "\n", 104 | "We are going to download the Wikipedia pages related to NASA's martian rovers using the python library `wikipedia`.\n", 105 | "\n", 106 | "These pages are converted into Haystack Documents." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "id": "-Nz9MRVgxcfW" 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "import wikipedia\n", 118 | "from haystack.dataclasses import Document\n", 119 | "\n", 120 | "wiki_pages = [\n", 121 | " \"Ingenuity_(helicopter)\",\n", 122 | " \"Perseverance_(rover)\",\n", 123 | " \"Curiosity_(rover)\",\n", 124 | " \"Opportunity_(rover)\",\n", 125 | " \"Spirit_(rover)\",\n", 126 | " \"Sojourner_(rover)\"\n", 127 | "]\n", 128 | "\n", 129 | "raw_docs=[]\n", 130 | "for title in wiki_pages:\n", 131 | " page = wikipedia.page(title=title, auto_suggest=False)\n", 132 | " doc = Document(content=page.content, meta={\"title\": page.title, \"url\":page.url})\n", 133 | " raw_docs.append(doc)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": "oa6aH6fB08d_" 140 | }, 141 | "source": [ 142 | "## Building the Indexing Pipeline\n", 143 | "\n", 144 | "Our indexing pipeline will preprocess the provided Wikipedia pages by cleaning and splitting it into chunks before storing them in [ChromaDocumentStore](https://docs.haystack.deepset.ai/v2.0/docs/chroma-document-store).\n", 145 | "\n", 146 | "Let’s run the pipeline below and index our file to our document store:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "id": "SrBctAl5e_Kf" 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "from pathlib import Path\n", 158 | "\n", 159 | "from haystack import Pipeline\n", 160 | "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", 161 | "from haystack.components.writers import DocumentWriter\n", 162 | "from haystack.document_stores.types import DuplicatePolicy\n", 163 | "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n", 164 | "\n", 165 | "## Initialize ChromaDocumentStore\n", 166 | "document_store = ChromaDocumentStore()\n", 167 | "\n", 168 | "## Create pipeline components\n", 169 | "cleaner = DocumentCleaner()\n", 170 | "splitter = DocumentSplitter(split_by=\"sentence\", split_length=10, split_overlap=2)\n", 171 | "writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n", 172 | "\n", 173 | "## Add components to the pipeline\n", 174 | "indexing_pipeline = Pipeline()\n", 175 | 
"indexing_pipeline.add_component(\"cleaner\", cleaner)\n", 176 | "indexing_pipeline.add_component(\"splitter\", splitter)\n", 177 | "indexing_pipeline.add_component(\"writer\", writer)\n", 178 | "\n", 179 | "## Connect the components to each other\n", 180 | "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", 181 | "indexing_pipeline.connect(\"splitter\", \"writer\")" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "id": "oJLXM8nM02AB" 188 | }, 189 | "source": [ 190 | "Run the pipeline with the files you want to index (note that this step may take some time):" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "colab": { 198 | "base_uri": "https://localhost:8080/" 199 | }, 200 | "id": "X7HrON1PFHos", 201 | "is_executing": true, 202 | "outputId": "22a57096-f22a-4333-cd85-9ea9ea4b52e0" 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "indexing_pipeline.run({\"cleaner\":{\"documents\":raw_docs}})" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "id": "UNmHvZLjA4Rv" 213 | }, 214 | "source": [ 215 | "## Building the Query Pipeline\n", 216 | "\n", 217 | "Let’s create another pipeline to query our application. In this pipeline, we’ll use [ChromaQueryTextRetriever](https://docs.haystack.deepset.ai/v2.0/docs/chromaqueryretriever) to retrieve relevant information from the ChromaDocumentStore and a Falcon 7B Instruct BF16 model to generate answers with [SagemakerGenerator](https://docs.haystack.deepset.ai/v2.0/docs/sagemakergenerator).\n", 218 | "\n", 219 | "Next, we'll create a prompt for our task using the Retrieval-Augmented Generation (RAG) approach with [PromptBuilder](https://docs.haystack.deepset.ai/v2.0/docs/promptbuilder). This prompt will help generate answers by considering the provided context. Finally, we'll connect these three components to complete the pipeline." 
220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "id": "8Q3JYuyShRnQ" 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "from haystack import Pipeline\n", 231 | "from haystack.components.builders import PromptBuilder\n", 232 | "from haystack_integrations.components.generators.amazon_sagemaker import SagemakerGenerator\n", 233 | "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n", 234 | "\n", 235 | "# Create pipeline components\n", 236 | "retriever = ChromaQueryTextRetriever(document_store=document_store, top_k=3)\n", 237 | "\n", 238 | "# Initialize the AmazonSagemakerGenerator with an Amazon Sagemaker model\n", 239 | "# You may need to change the model name if it differs from your endpoint name.\n", 240 | "model = 'jumpstart-dft-hf-llm-falcon-7b-instruct-bf16'\n", 241 | "generator = SagemakerGenerator(model=model, generation_kwargs={\"max_new_tokens\":256})\n", 242 | "template = \"\"\"\n", 243 | "{% for document in documents %}\n", 244 | " {{ document.content }}\n", 245 | "{% endfor %}\n", 246 | "\n", 247 | "Answer based on the information above: {{question}}\n", 248 | "\"\"\"\n", 249 | "prompt_builder = PromptBuilder(template=template)\n", 250 | "\n", 251 | "## Add components to the pipeline\n", 252 | "rag_pipeline = Pipeline()\n", 253 | "rag_pipeline.add_component(\"retriever\", retriever)\n", 254 | "rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n", 255 | "rag_pipeline.add_component(\"llm\", generator)\n", 256 | "\n", 257 | "## Connect the components to each other\n", 258 | "rag_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", 259 | "rag_pipeline.connect(\"prompt_builder\", \"llm\")" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": { 265 | "id": "5NywqZKo6msf" 266 | }, 267 | "source": [ 268 | "Ask your question and learn about the Amazon Sagemaker service using Amazon Sagemaker models!" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "colab": { 276 | "base_uri": "https://localhost:8080/" 277 | }, 278 | "id": "mDYCSRRtiAy5", 279 | "outputId": "b644aeb8-c9eb-4dbf-ed28-ad3080826410" 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "\n", 287 | "Opportunity landed on Mars on January 24, 2004.\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "question = \"When did Opportunity land?\"\n", 293 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 294 | "\n", 295 | "print(response[\"llm\"][\"replies\"][0])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "colab": { 303 | "base_uri": "https://localhost:8080/" 304 | }, 305 | "id": "giSWajzyAcNp", 306 | "outputId": "ac240c1a-657d-447a-8f08-8558616d71e9" 307 | }, 308 | "outputs": [ 309 | { 310 | "name": "stdout", 311 | "output_type": "stream", 312 | "text": [ 313 | "\n", 314 | "Yes, the Ingenuity mission is over. 
The helicopter made a total of 72 flights over a period of about 3 years until rotor damage sustained in January 2024 forced an end to the mission.\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "question = \"Is Ingenuity mission over?\"\n", 320 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 321 | "\n", 322 | "print(response[\"llm\"][\"replies\"][0])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "colab": { 330 | "base_uri": "https://localhost:8080/" 331 | }, 332 | "id": "ROhJ8VL_JdHc", 333 | "outputId": "4c77f458-d472-4644-b073-304986bf7a6c" 334 | }, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "\n", 341 | "The first NASA rover to land on Mars was called Sojourner.\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "question = \"What was the name of the first NASA rover to land on Mars?\"\n", 347 | "response = rag_pipeline.run({\"retriever\": {\"query\": question}, \"prompt_builder\": {\"question\": question}})\n", 348 | "\n", 349 | "print(response[\"llm\"][\"replies\"][0])" 350 | ] 351 | } 352 | ], 353 | "metadata": { 354 | "colab": { 355 | "provenance": [] 356 | }, 357 | "kernelspec": { 358 | "display_name": "Python 3", 359 | "name": "python3" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 3 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython3", 371 | "version": "3.9.7" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /notebooks/apify_haystack_instagram_comments_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "t1BeKtSo7KzI" 7 | }, 8 | "source": [ 9 | "# Analyze Your Instagram Comments’ Vibe with Apify and Haystack\n", 10 | "\n", 11 | "Author: Jiri Spilka ([Apify](https://apify.com/jiri.spilka)) \n", 12 | "Idea: Bilge Yücel ([deepset.ai](https://github.com/bilgeyucel))\n", 13 | "\n", 14 | "Ever wondered if your Instagram posts are truly vibrating among your audience?\n", 15 | "In this cookbook, we'll show you how to use the [Instagram Comment Scraper](https://apify.com/apify/instagram-comment-scraper) Actor to download comments from your instagram post and analyze them using a large language model. All performed within the Haystack ecosystem using the [apify-haystack](https://github.com/apify/apify-haystack/tree/main) integration.\n", 16 | "\n", 17 | "We'll start by using the Actor to download the comments, clean the data with the [DocumentCleaner](https://docs.haystack.deepset.ai/docs/documentcleaner) and then use the [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) to discover the vibe of the Instagram posts." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "-7zY6NIsCj_5" 24 | }, 25 | "source": [ 26 | "# Install dependencies" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "collapsed": true, 37 | "id": "r5AJeMOE1Cou", 38 | "outputId": "63663073-ccc5-4306-ae18-e2720d937407" 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "!pip install apify-haystack==0.1.4 haystack-ai" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": { 48 | "id": "h6MmIG9K1HkK" 49 | }, 50 | "source": [ 51 | "## Set up the API keys\n", 52 | "\n", 53 | "You need to have an Apify account and obtain [APIFY_API_TOKEN](https://docs.apify.com/platform/integrations/api).\n", 54 | "\n", 55 | "You also need an OpenAI account and [OPENAI_API_KEY](https://platform.openai.com/docs/quickstart)\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "colab": { 63 | "base_uri": "https://localhost:8080/" 64 | }, 65 | "id": "yiUTwYzP36Yr", 66 | "outputId": "d79acadc-bd18-44d3-c812-9b40c51d5124" 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Enter YOUR APIFY_API_TOKEN··········\n", 74 | "Enter YOUR OPENAI_API_KEY··········\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "import os\n", 80 | "from getpass import getpass\n", 81 | "\n", 82 | "os.environ[\"APIFY_API_TOKEN\"] = getpass(\"Enter YOUR APIFY_API_TOKEN\")\n", 83 | "os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter YOUR OPENAI_API_KEY\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "id": "HQzAujMc505k" 90 | }, 91 | "source": [ 92 | "## Use the Haystack Pipeline to Orchestrate Instagram Comments Scraper, Comments Cleanup, and Analysis Using LLM\n", 93 | "\n", 94 | "Now, let's decide which post to analyze. We can start with these two posts that might reveal some interesting insights:\n", 95 | "\n", 96 | "- `@tiffintech` on [How to easily keep up with tech?](https://www.instagram.com/p/C_a9jcRuJZZ/)\n", 97 | "- `@kamaharishis` on [Affordable Care Act](https://www.instagram.com/p/C_RgBzogufK)\n", 98 | "\n", 99 | "We'll download the comments using the Instagram Scraper Actor. 
But first, we need to understand the output format of the Actor.\n", 100 | "\n", 101 | "The output is in the following format:\n", 102 | "```json\n", 103 | "[\n", 104 | " {\n", 105 | " \"text\": \"You've just uncovered the goldmine for me 😍 but I still love your news and updates!\",\n", 106 | " \"timestamp\": \"2024-09-02T16:27:09.000Z\",\n", 107 | " \"ownerUsername\": \"codingmermaid.ai\",\n", 108 | " \"ownerProfilePicUrl\": \"....\",\n", 109 | " \"postUrl\": \"https://www.instagram.com/p/C_a9jcRuJZZ/\"\n", 110 | " },\n", 111 | " {\n", 112 | " \"text\": \"Will check it out🙌\",\n", 113 | " \"timestamp\": \"2024-09-02T16:29:28.000Z\",\n", 114 | " \"ownerUsername\": \"author.parijat\",\n", 115 | " \"postUrl\": \"https://www.instagram.com/p/C_a9jcRuJZZ/\"\n", 116 | " }\n", 117 | "]\n", 118 | "```\n", 119 | "We will convert this JSON to a Haystack Document using the `dataset_mapping_function` as follows" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 3, 125 | "metadata": { 126 | "id": "OZ0PAVHI_mhn" 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from haystack import Document\n", 131 | "\n", 132 | "def dataset_mapping_function(dataset_item: dict) -> Document:\n", 133 | " return Document(content=dataset_item.get(\"text\"), meta={\"ownerUsername\": dataset_item.get(\"ownerUsername\")})" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": "xtFquWflA5kf" 140 | }, 141 | "source": [ 142 | "Once we understand the Actor output format and have the `dataset_mapping_function`, we can set up the Haystack component to enable interaction between Haystack and Apify.\n", 143 | "\n", 144 | "First, we need to provide the `actor_id` and `dataset_mapping_function`, along with the input parameters `run_input`.\n", 145 | "\n", 146 | "We can define the `run_input` in four ways: \n", 147 | "- i) when creating the `ApifyDatasetFromActorCall` class \n", 148 | "- ii) as arguments in a pipeline. \n", 149 | "- iii) as arguments to the `run()` function when calling `ApifyDatasetFromActorCall.run()` \n", 150 | "- iv) as a combination of `i)` and `ii)` as shown in this cookbook.\n", 151 | "\n", 152 | "For a detailed description of the input parameters, visit the [Instagram Comments Scraper page](https://apify.com/apify/instagram-comment-scraper).\n", 153 | "\n", 154 | "Let's set up the `ApifyDatasetFromActorCall`" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 4, 160 | "metadata": { 161 | "id": "SUWXxT4y55lH" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "from apify_haystack import ApifyDatasetFromActorCall\n", 166 | "\n", 167 | "document_loader = ApifyDatasetFromActorCall(\n", 168 | " actor_id=\"apify/instagram-comment-scraper\",\n", 169 | " run_input={\"resultsLimit\": 50},\n", 170 | " dataset_mapping_function=dataset_mapping_function,\n", 171 | ")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "id": "BxHbPUipjrvS" 178 | }, 179 | "source": [ 180 | "\n", 181 | "Next, we'll define a `prompt` for the LLM and connect all the components in the [Pipeline](https://docs.haystack.deepset.ai/docs/pipelines).
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 5, 187 | "metadata": { 188 | "colab": { 189 | "base_uri": "https://localhost:8080/" 190 | }, 191 | "id": "gdN7baGrA_lR", 192 | "outputId": "b73b1217-3082-4da7-c824-b8671eeef78d" 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "\n", 199 | "🚅 Components\n", 200 | " - loader: ApifyDatasetFromActorCall\n", 201 | " - cleaner: DocumentCleaner\n", 202 | " - prompt_builder: PromptBuilder\n", 203 | " - llm: OpenAIGenerator\n", 204 | "🛤️ Connections\n", 205 | " - loader.documents -> cleaner.documents (list[Document])\n", 206 | " - cleaner.documents -> prompt_builder.documents (List[Document])\n", 207 | " - prompt_builder.prompt -> llm.prompt (str)" 208 | ] 209 | }, 210 | "execution_count": 5, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "from haystack import Pipeline\n", 217 | "from haystack.components.builders import PromptBuilder\n", 218 | "from haystack.components.generators import OpenAIGenerator\n", 219 | "from haystack.components.preprocessors import DocumentCleaner\n", 220 | "\n", 221 | "prompt = \"\"\"\n", 222 | "Analyze these Instagram comments to determine if the post is generating positive energy, excitement,\n", 223 | "or high engagement. Focus on sentiment, emotional tone, and engagement patterns to conclude if\n", 224 | "the post is 'vibrating' with high energy. Be concise.\"\n", 225 | "\n", 226 | "Context:\n", 227 | "{% for document in documents %}\n", 228 | " {{ document.content }}\n", 229 | "{% endfor %}\n", 230 | "\n", 231 | "Analysis:\n", 232 | "\"\"\"\n", 233 | "\n", 234 | "cleaner = DocumentCleaner(remove_empty_lines=True, remove_extra_whitespaces=True, remove_repeated_substrings=True)\n", 235 | "prompt_builder = PromptBuilder(template=prompt)\n", 236 | "generator = OpenAIGenerator(model=\"gpt-4o-mini\")\n", 237 | "\n", 238 | "\n", 239 | "pipe = Pipeline()\n", 240 | "pipe.add_component(\"loader\", document_loader)\n", 241 | "pipe.add_component(\"cleaner\", cleaner)\n", 242 | "pipe.add_component(\"prompt_builder\", prompt_builder)\n", 243 | "pipe.add_component(\"llm\", generator)\n", 244 | "pipe.connect(\"loader\", \"cleaner\")\n", 245 | "pipe.connect(\"cleaner\", \"prompt_builder\")\n", 246 | "pipe.connect(\"prompt_builder\", \"llm\")" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "id": "GxDNZ7LqAsWV" 253 | }, 254 | "source": [ 255 | "After that, we can run the pipeline. The execution and analysis will take approximately 30-60 seconds." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 6, 261 | "metadata": { 262 | "colab": { 263 | "base_uri": "https://localhost:8080/", 264 | "height": 72 265 | }, 266 | "id": "qfaWI6BaAko9", 267 | "outputId": "25e33c1b-f8b9-4b6d-a3d9-0eb54365b820" 268 | }, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "application/vnd.google.colaboratory.intrinsic+json": { 273 | "type": "string" 274 | }, 275 | "text/plain": [ 276 | "'Overall, the Instagram comments on the post reflect positive energy, excitement, and high engagement. The use of emojis such as 😂, 😍, 🙌, ❤️, and 🔥 indicate enthusiasm and excitement. Many comments express gratitude, appreciation, and eagerness to explore the resources mentioned in the post. There are also interactions between users tagging each other and discussing their interest in the topic, further increasing engagement. 
Overall, the post seems to be generating high energy and positive vibes from the audience.'" 277 | ] 278 | }, 279 | "execution_count": 6, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "# \\@tiffintech on How to easily keep up with tech?\n", 286 | "url = \"https://www.instagram.com/p/C_a9jcRuJZZ/\"\n", 287 | "\n", 288 | "res = pipe.run({\"loader\": {\"run_input\": {\"directUrls\": [url]}}})\n", 289 | "res.get(\"llm\", {}).get(\"replies\", [\"No response\"])[0]\n", 290 | "\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": { 296 | "id": "jPfgD939E2TW" 297 | }, 298 | "source": [ 299 | "Now, let's run the same analysis, this time with the @kamalaharris post." 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 7, 305 | "metadata": { 306 | "colab": { 307 | "base_uri": "https://localhost:8080/", 308 | "height": 72 309 | }, 310 | "id": "mCFb8KZOEkpW", 311 | "outputId": "f6b61f27-59f6-4898-b202-1838f8fd00f2" 312 | }, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "application/vnd.google.colaboratory.intrinsic+json": { 317 | "type": "string" 318 | }, 319 | "text/plain": [ 320 | "'The comments on this post are highly polarized, with strong opinions expressed on both sides of the political spectrum. There is a mix of negative and positive sentiment, with some users expressing excitement and support for the current administration (e.g., emojis like 💙💙💙💙, Kamala 👏👏) while others criticize past policies and individuals associated with them (e.g., Trump 2024, lack of education). Overall, the engagement on this post is high, with users actively debating and defending their viewpoints. Despite the divisive nature of the comments, the post is generating a high level of energy and engagement.'" 321 | ] 322 | }, 323 | "execution_count": 7, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "# \\@kamalaharris on Affordable Care Act\n", 330 | "url = \"https://www.instagram.com/p/C_RgBzogufK/\"\n", 331 | "\n", 332 | "res = pipe.run({\"loader\": {\"run_input\": {\"directUrls\": [url]}}})\n", 333 | "res.get(\"llm\", {}).get(\"replies\", [\"No response\"])[0]" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "id": "45YxSr6v__fI" 340 | }, 341 | "source": [ 342 | "The analysis shows that the first post about [How to easily keep up with tech?](https://www.instagram.com/p/C_a9jcRuJZZ/) is vibrating with high energy:\n", 343 | "\n", 344 | "*The Instagram comments reveal a strong level of engagement and positive energy. Emojis like 😍, 😂, ❤️, 🙌, and 🔥 are frequently used, indicating excitement and enthusiasm. Commenters express gratitude, excitement, and appreciation for the content. The tone is overwhelmingly positive, supportive, and encouraging, with many users tagging others to share the content. Overall, this post is generating a vibrant and highly engaged response.*\n", 345 | "\n", 346 | "However, the post by `@kamalaharris` on the [Affordable Care Act](https://www.instagram.com/p/C_RgBzogufK) is (not surprisingly) sparking a lot of controversy with negative comments.\n", 347 | "\n", 348 | "*The comments on this post are generating negative energy but with high engagement. There's a strong focus on political opinions, particularly concerning insurance companies, the Affordable Care Act, Trump, and Biden.
Many comments express frustration, criticism, and disagreement, with some users discussing party affiliations or support for specific politicians. There are also mentions of misinformation and conspiracy theories. Engagement is high, with numerous comment threads delving into various political issues. Overall, this post is vibrating with intense energy, driven by political opinions, disagreements, and active discussions.*\n", 349 | "\n", 350 | "💡 You might receive slightly different results, as the comments may have changed since the last run" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "colab": { 356 | "provenance": [] 357 | }, 358 | "kernelspec": { 359 | "display_name": "Python 3", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "name": "python" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 0 368 | } 369 | -------------------------------------------------------------------------------- /notebooks/apify_haystack_rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "t1BeKtSo7KzI" 7 | }, 8 | "source": [ 9 | "# RAG: Extract and use website content for question answering with Apify-Haystack integration\n", 10 | "\n", 11 | "Author: Jiri Spilka ([Apify](https://apify.com/jiri.spilka))\n", 12 | "\n", 13 | "In this tutorial, we'll use the [apify-haystack](https://github.com/apify/apify-haystack/tree/main) integration to call [Website Content Crawler](https://apify.com/apify/website-content-crawler) and crawl and scrape text content from the [Haystack website](https://haystack.deepset.ai). Then, we'll use the [OpenAIDocumentEmbedder](https://docs.haystack.deepset.ai/docs/openaidocumentembedder) to compute text embeddings and the [InMemoryDocumentStore](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore) to store documents in a temporary in-memory database. 
The last step will be a retrieval augmented generation pipeline to answer users' questions from the scraped data.\n", 14 | "\n", 15 | "\n", 16 | "## Install dependencies" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "id": "r5AJeMOE1Cou" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "!pip install apify-haystack haystack-ai" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "h6MmIG9K1HkK" 34 | }, 35 | "source": [ 36 | "## Set up the API keys\n", 37 | "\n", 38 | "You need to have an Apify account and obtain [APIFY_API_TOKEN](https://docs.apify.com/platform/integrations/api).\n", 39 | "\n", 40 | "You also need an OpenAI account and [OPENAI_API_KEY](https://platform.openai.com/docs/quickstart)\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": { 47 | "colab": { 48 | "base_uri": "https://localhost:8080/" 49 | }, 50 | "id": "yiUTwYzP36Yr", 51 | "outputId": "b53b07db-42a9-4109-e322-705a8312da2e" 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Enter YOUR APIFY_API_TOKEN··········\n", 59 | "Enter YOUR OPENAI_API_KEY··········\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "import os\n", 65 | "from getpass import getpass\n", 66 | "\n", 67 | "os.environ[\"APIFY_API_TOKEN\"] = getpass(\"Enter YOUR APIFY_API_TOKEN\")\n", 68 | "os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter YOUR OPENAI_API_KEY\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "HQzAujMc505k" 75 | }, 76 | "source": [ 77 | "## Use the Website Content Crawler to scrape data from the haystack documentation\n", 78 | "\n", 79 | "Now, let us call the Website Content Crawler using the Haystack component `ApifyDatasetFromActorCall`. First, we need to define parameters for the Website Content Crawler and then what data we need to save into the vector database.\n", 80 | "\n", 81 | "The `actor_id` and detailed description of input parameters (variable `run_input`) can be found on the [Website Content Crawler input page](https://apify.com/apify/website-content-crawler/input-schema).\n", 82 | "\n", 83 | "For this example, we will define `startUrls` and limit the number of crawled pages to five." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "id": "_AYgcfBx681h" 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "actor_id = \"apify/website-content-crawler\"\n", 95 | "run_input = {\n", 96 | " \"maxCrawlPages\": 5, # limit the number of pages to crawl\n", 97 | " \"startUrls\": [{\"url\": \"https://haystack.deepset.ai/\"}],\n", 98 | "}" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "yIODy29t-_JY" 105 | }, 106 | "source": [ 107 | "Next, we need to define a dataset mapping function. We need to know the output of the Website Content Crawler. Typically, it is a JSON object that looks like this (truncated for brevity):\n", 108 | "\n", 109 | "```\n", 110 | "[\n", 111 | " {\n", 112 | " \"url\": \"https://haystack.deepset.ai/\",\n", 113 | " \"text\": \"Haystack | Haystack - Multimodal - AI - Architect a next generation AI app around all modalities, not just text ...\"\n", 114 | " },\n", 115 | " {\n", 116 | " \"url\": \"https://haystack.deepset.ai/tutorials/24_building_chat_app\",\n", 117 | " \"text\": \"Building a Conversational Chat App ... 
\"\n", 118 | " },\n", 119 | "]\n", 120 | "```\n", 121 | "\n", 122 | "We will convert this JSON to a Haystack `Document` using the `dataset_mapping_function` as follows:\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "id": "OZ0PAVHI_mhn" 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from haystack import Document\n", 134 | "\n", 135 | "def dataset_mapping_function(dataset_item: dict) -> Document:\n", 136 | " return Document(content=dataset_item.get(\"text\"), meta={\"url\": dataset_item.get(\"url\")})" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "id": "xtFquWflA5kf" 143 | }, 144 | "source": [ 145 | "And the definition of the `ApifyDatasetFromActorCall`:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "id": "gdN7baGrA_lR" 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "from apify_haystack import ApifyDatasetFromActorCall\n", 157 | "\n", 158 | "apify_dataset_loader = ApifyDatasetFromActorCall(\n", 159 | " actor_id=actor_id,\n", 160 | " run_input=run_input,\n", 161 | " dataset_mapping_function=dataset_mapping_function,\n", 162 | ")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "id": "3hG6SvMm_mAB" 169 | }, 170 | "source": [ 171 | "Before actually running the Website Content Crawler, we need to define embedding function and document store:" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "id": "zKr0KTfhAQz6" 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from haystack.components.embedders import OpenAIDocumentEmbedder\n", 183 | "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", 184 | "\n", 185 | "document_store = InMemoryDocumentStore()\n", 186 | "docs_embedder = OpenAIDocumentEmbedder()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "id": "GxDNZ7LqAsWV" 193 | }, 194 | "source": [ 195 | "After that, we can call the Website Content Crawler and print the scraped data:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "colab": { 203 | "base_uri": "https://localhost:8080/" 204 | }, 205 | "id": "qfaWI6BaAko9", 206 | "outputId": "ba5e115e-4c9d-42fd-c167-0bf06163d52c" 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "{'documents': [Document(id=6c4d570874ff59ed4e06017694bee8a72d766d2ed55c6453fbc9ea91fd2e6bde, content: 'Haystack | Haystack Luma · Delightful Events Start HereAWS Summit Berlin 2023: Building Generative A...', meta: {'url': 'https://haystack.deepset.ai/'}), Document(id=d420692bf66efaa56ebea200a4a63597667bdc254841b99654239edf67737bcb, content: 'Tutorials & Walkthroughs | Haystack\n", 214 | "Tutorials & Walkthroughs2.0\n", 215 | "Whether you’re a beginner or an expe...', meta: {'url': 'https://haystack.deepset.ai/tutorials'}), Document(id=5a529a308d271ba76f66a060c0b706b73103406ac8a853c19f20e1594823efe8, content: 'Get Started | Haystack\n", 216 | "Haystack is an open-source Python framework that helps developers build LLM-p...', meta: {'url': 'https://haystack.deepset.ai/overview/quick-start'}), Document(id=1d126a03ae50586729846d492e9e8aca802d7f281a72a8869ded08ebc5585a36, content: 'What is Haystack? 
| Haystack\n", 217 | "Haystack is an open source framework for building production-ready LLM ...', meta: {'url': 'https://haystack.deepset.ai/overview/intro'}), Document(id=4324a62242590d4ecf9b080319607fa1251aa0822bbe2ce6b21047e783999703, content: 'Integrations | Haystack\n", 218 | "The Haystack ecosystem integrates with many other technologies, such as vect...', meta: {'url': 'https://haystack.deepset.ai/integrations'})]}\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "# Crawler website and store documents in the document_store\n", 224 | "# Crawling will take some time (1-2 minutes), you can monitor progress in the https://console.apify.com/actors/runs\n", 225 | "\n", 226 | "docs = apify_dataset_loader.run()\n", 227 | "print(docs)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": { 233 | "id": "OxIGcmHcClQo" 234 | }, 235 | "source": [ 236 | "Compute the embeddings and store them in the database:" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "colab": { 244 | "base_uri": "https://localhost:8080/" 245 | }, 246 | "id": "YrKAkHLuCp6N", 247 | "outputId": "a0234fa4-1265-4212-be6a-f844708126e3" 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stderr", 252 | "output_type": "stream", 253 | "text": [ 254 | "Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 3.29it/s]\n" 255 | ] 256 | }, 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "5" 261 | ] 262 | }, 263 | "execution_count": 29, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "embeddings = docs_embedder.run(docs.get(\"documents\"))\n", 270 | "document_store.write_documents(embeddings[\"documents\"])" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "id": "18tOCjLEDNGu" 277 | }, 278 | "source": [ 279 | "## Retrieval and LLM generative pipeline\n", 280 | "\n", 281 | "Once we have the crawled data in the database, we can set up the classical retrieval augmented pipeline. 
Refer to the [RAG Haystack tutorial](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) for details.\n" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "colab": { 289 | "base_uri": "https://localhost:8080/" 290 | }, 291 | "id": "31W_jlNWFkz3", 292 | "outputId": "af4ddf9a-3ea1-4d4a-bb08-bd2d828fe517" 293 | }, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "Initializing pipeline...\n" 300 | ] 301 | }, 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "\n", 306 | "🚅 Components\n", 307 | " - embedder: OpenAITextEmbedder\n", 308 | " - retriever: InMemoryEmbeddingRetriever\n", 309 | " - prompt_builder: PromptBuilder\n", 310 | " - llm: OpenAIGenerator\n", 311 | "🛤️ Connections\n", 312 | " - embedder.embedding -> retriever.query_embedding (List[float])\n", 313 | " - retriever.documents -> prompt_builder.documents (List[Document])\n", 314 | " - prompt_builder.prompt -> llm.prompt (str)" 315 | ] 316 | }, 317 | "execution_count": 30, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "from haystack import Pipeline\n", 324 | "from haystack.components.builders import PromptBuilder\n", 325 | "from haystack.components.embedders import OpenAITextEmbedder\n", 326 | "from haystack.components.generators import OpenAIGenerator\n", 327 | "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n", 328 | "\n", 329 | "text_embedder = OpenAITextEmbedder()\n", 330 | "retriever = InMemoryEmbeddingRetriever(document_store)\n", 331 | "generator = OpenAIGenerator(model=\"gpt-4o-mini\")\n", 332 | "\n", 333 | "template = \"\"\"\n", 334 | "Given the following information, answer the question.\n", 335 | "\n", 336 | "Context:\n", 337 | "{% for document in documents %}\n", 338 | " {{ document.content }}\n", 339 | "{% endfor %}\n", 340 | "\n", 341 | "Question: {{question}}\n", 342 | "Answer:\n", 343 | "\"\"\"\n", 344 | "\n", 345 | "prompt_builder = PromptBuilder(template=template)\n", 346 | "\n", 347 | "# Add components to your pipeline\n", 348 | "print(\"Initializing pipeline...\")\n", 349 | "pipe = Pipeline()\n", 350 | "pipe.add_component(\"embedder\", text_embedder)\n", 351 | "pipe.add_component(\"retriever\", retriever)\n", 352 | "pipe.add_component(\"prompt_builder\", prompt_builder)\n", 353 | "pipe.add_component(\"llm\", generator)\n", 354 | "\n", 355 | "# Now, connect the components to each other\n", 356 | "pipe.connect(\"embedder.embedding\", \"retriever.query_embedding\")\n", 357 | "pipe.connect(\"retriever\", \"prompt_builder.documents\")\n", 358 | "pipe.connect(\"prompt_builder\", \"llm\")\n", 359 | "\n" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "id": "CXP-_TqcGU2Z" 366 | }, 367 | "source": [ 368 | "Now, you can ask questions about Haystack and get correct answers:" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "colab": { 376 | "base_uri": "https://localhost:8080/" 377 | }, 378 | "id": "uPtoRZEdF1BN", 379 | "outputId": "cb8d2266-6274-42b8-cf25-765c2d3de62f" 380 | }, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "question: What is haystack?\n", 387 | "answer: Haystack is an open-source Python framework designed to help developers build LLM-powered custom applications. 
It is used for creating production-ready LLM applications, retrieval-augmented generative pipelines, and state-of-the-art search systems that work effectively over large document collections. Haystack offers comprehensive tooling for developing AI systems that use LLMs from platforms like Hugging Face, OpenAI, Cohere, Mistral, and more. It provides a modular and intuitive framework that allows users to quickly integrate the latest AI models, offering flexibility and ease of use. The framework includes components and pipelines that enable developers to build end-to-end AI projects without the need to understand the underlying models deeply. Haystack caters to LLM enthusiasts and beginners alike, providing a vibrant open-source community for collaboration and learning.\n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "question = \"What is haystack?\"\n", 393 | "\n", 394 | "response = pipe.run({\"embedder\": {\"text\": question}, \"prompt_builder\": {\"question\": question}})\n", 395 | "\n", 396 | "print(f\"question: {question}\")\n", 397 | "print(f\"answer: {response['llm']['replies'][0]}\")" 398 | ] 399 | } 400 | ], 401 | "metadata": { 402 | "colab": { 403 | "provenance": [] 404 | }, 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "name": "python3" 408 | }, 409 | "language_info": { 410 | "name": "python" 411 | } 412 | }, 413 | "nbformat": 4, 414 | "nbformat_minor": 0 415 | } 416 | -------------------------------------------------------------------------------- /notebooks/auto_merging_retriever.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "3hGCrW6Ue-7v" 7 | }, 8 | "source": [ 9 | "# Improving Retrieval with Auto-Merging and Hierarchical Document Retrieval\n", 10 | "\n", 11 | "This notebook shows how to use Haystack components: `AutoMergingRetriever` and `HierarchicalDocumentSplitter`.\n", 12 | "\n", 13 | "- 📚[Read the full article here](https://haystack.deepset.ai/blog/improve-retrieval-with-auto-merging)\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "TeOAvHZsBXhf" 20 | }, 21 | "source": [ 22 | "## Setting up" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "colab": { 30 | "base_uri": "https://localhost:8080/" 31 | }, 32 | "id": "LaJsFx4P1o_l", 33 | "outputId": "a5b29fa2-6d74-4ccf-e732-77c8a4f68491" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip install haystack-ai" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "id": "2bgDKD-GgX4l" 44 | }, 45 | "source": [ 46 | "## Let's get a dataset to index and explore\n", 47 | "\n", 48 | "- We will use a dataset containing 2225 news articles from the paper \"Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering\", Proc. ICML 2006, by D. Greene and P.
Cunningham.\n", 49 | "\n", 50 | "- The original dataset is available at http://mlg.ucd.ie/datasets/bbc.html, but we will instead use a CSV processed version available here: https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "colab": { 58 | "base_uri": "https://localhost:8080/" 59 | }, 60 | "id": "cpMYVx1VY7Z7", 61 | "outputId": "521dbe20-c6dc-4897-c4d7-764b6b82cea1" 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "--2024-09-06 09:41:04-- https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv\n", 69 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", 70 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", 71 | "HTTP request sent, awaiting response... 200 OK\n", 72 | "Length: 5080260 (4.8M) [text/plain]\n", 73 | "Saving to: ‘bbc-news-data.csv’\n", 74 | "\n", 75 | "bbc-news-data.csv 100%[===================>] 4.84M --.-KB/s in 0.09s \n", 76 | "\n", 77 | "2024-09-06 09:41:05 (56.4 MB/s) - ‘bbc-news-data.csv’ saved [5080260/5080260]\n", 78 | "\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "!wget https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "id": "4gm-olXqhyoE" 90 | }, 91 | "source": [ 92 | "\n", 93 | "## Let's convert the raw data into Haystack Documents" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "id": "BNfHo3tN_8mQ" 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "import csv\n", 105 | "from typing import List\n", 106 | "from haystack import Document\n", 107 | "\n", 108 | "def read_documents() -> List[Document]:\n", 109 | " with open(\"bbc-news-data.csv\", \"r\") as file:\n", 110 | " reader = csv.reader(file, delimiter=\"\\t\")\n", 111 | " next(reader, None) # skip the headers\n", 112 | " documents = []\n", 113 | " for row in reader:\n", 114 | " category = row[0].strip()\n", 115 | " title = row[2].strip()\n", 116 | " text = row[3].strip()\n", 117 | " documents.append(Document(content=text, meta={\"category\": category, \"title\": title}))\n", 118 | "\n", 119 | " return documents" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "id": "1ThlSJguh7ok" 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "docs = read_documents()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | }, 140 | "id": "_OSZntiEh7zs", 141 | "outputId": "3af289d5-b829-4717-c1b3-ac7a1fe2ad27" 142 | }, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "[Document(id=8b0eec9b4039d3c21eed119c9cbf1022a172f6b96661a391c76ee9a00b388334, content: 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to...', meta: {'category': 'business', 'title': 'Ad sales boost Time Warner profit'}),\n", 148 | " Document(id=0b20edb280b3c492d81751d97aa67f008759b242f2596d56c6816bacb5ea0c08, content: 'The dollar has hit its highest level against the euro in almost three months after the Federal Reser...', meta: {'category': 'business', 'title': 'Dollar gains on Greenspan 
speech'}),\n", 149 | " Document(id=9465b0a3c9e81843db56beb8cb3183b14810e8fc7b3195bd37718296f3a13e31, content: 'The owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit t...', meta: {'category': 'business', 'title': 'Yukos unit buyer faces loan claim'}),\n", 150 | " Document(id=151d64ed92b61b1b9e58c52a90e7ab4be964c0e47aaf1a233dfb93110986d9cd, content: 'British Airways has blamed high fuel prices for a 40% drop in profits. Reporting its results for th...', meta: {'category': 'business', 'title': \"High fuel prices hit BA's profits\"}),\n", 151 | " Document(id=4355d611f770b814f9e7d33959ad9d16b69048650ed0eaf24f1bce3e8ab5bf4c, content: 'Shares in UK drinks and food firm Allied Domecq have risen on speculation that it could be the targe...', meta: {'category': 'business', 'title': 'Pernod takeover talk lifts Domecq'})]" 152 | ] 153 | }, 154 | "execution_count": 5, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "docs[0:5]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "id": "jJKxxkysiacd" 167 | }, 168 | "source": [ 169 | "We can see that we have successfully created Documents.\n", 170 | "\n", 171 | "## Document Splitting and Indexing\n", 172 | "\n", 173 | "Now we split each document into smaller ones creating an hierarchical document structure connecting each smaller child documents with the corresponding parent document.\n", 174 | "\n", 175 | "We also create two document stores, one for the leaf documents and the other for the parent documents." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "id": "7KV0CNqJAVM5" 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "from typing import Tuple\n", 187 | "\n", 188 | "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", 189 | "from haystack.document_stores.types import DuplicatePolicy\n", 190 | "\n", 191 | "from haystack.components.preprocessors import HierarchicalDocumentSplitter\n", 192 | "\n", 193 | "def indexing(documents: List[Document]) -> Tuple[InMemoryDocumentStore, InMemoryDocumentStore]:\n", 194 | " splitter = HierarchicalDocumentSplitter(block_sizes={10, 3}, split_overlap=0, split_by=\"word\")\n", 195 | " docs = splitter.run(documents)\n", 196 | "\n", 197 | " # Store the leaf documents in one document store\n", 198 | " leaf_documents = [doc for doc in docs[\"documents\"] if doc.meta[\"__level\"] == 1]\n", 199 | " leaf_doc_store = InMemoryDocumentStore()\n", 200 | " leaf_doc_store.write_documents(leaf_documents, policy=DuplicatePolicy.OVERWRITE)\n", 201 | "\n", 202 | " # Store the parent documents in another document store\n", 203 | " parent_documents = [doc for doc in docs[\"documents\"] if doc.meta[\"__level\"] == 0]\n", 204 | " parent_doc_store = InMemoryDocumentStore()\n", 205 | " parent_doc_store.write_documents(parent_documents, policy=DuplicatePolicy.OVERWRITE)\n", 206 | "\n", 207 | " return leaf_doc_store, parent_doc_store" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "id": "VJDoInqQnQaT" 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "leaf_doc_store, parent_doc_store = indexing(docs)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "VDMahtVkpMAT" 225 | }, 226 | "source": [ 227 | "## Retrieving Documents with Auto-Merging\n", 228 | "\n", 229 | "We are now ready to query the document store using the 
`AutoMergingRetriever`. Let's build a pipeline that uses the `BM25Retriever` to handle the user queries, and connect it to the `AutoMergingRetriever`, which, based on the documents retrieved and the hierarchical structure, decides whether the leaf documents or the parent document is returned." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "id": "ysiW52XcAWRB" 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "from haystack import Pipeline\n", 241 | "from haystack.components.retrievers.in_memory import InMemoryBM25Retriever\n", 242 | "from haystack.components.retrievers import AutoMergingRetriever\n", 243 | "\n", 244 | "def querying_pipeline(leaf_doc_store: InMemoryDocumentStore, parent_doc_store: InMemoryDocumentStore, threshold: float = 0.6):\n", 245 | " pipeline = Pipeline()\n", 246 | " bm25_retriever = InMemoryBM25Retriever(document_store=leaf_doc_store)\n", 247 | " auto_merge_retriever = AutoMergingRetriever(parent_doc_store, threshold=threshold)\n", 248 | " pipeline.add_component(instance=bm25_retriever, name=\"BM25Retriever\")\n", 249 | " pipeline.add_component(instance=auto_merge_retriever, name=\"AutoMergingRetriever\")\n", 250 | " pipeline.connect(\"BM25Retriever.documents\", \"AutoMergingRetriever.documents\")\n", 251 | " return pipeline" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": { 257 | "id": "CCf73zK6sdZ1" 258 | }, 259 | "source": [ 260 | "Let's create this pipeline by setting the threshold for the `AutoMergingRetriever` at 0.6" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "id": "PTNb-HZCp3fm" 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "pipeline = querying_pipeline(leaf_doc_store, parent_doc_store, threshold=0.6)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "id": "BzlnybXQstXB" 278 | }, 279 | "source": [ 280 | "Let's now query the pipeline for articles related to cybersecurity. Let's also make use of the pipeline parameter `include_outputs_from` to get the outputs from the `BM25Retriever` component as well.
\n" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "id": "Gem7W5JmsY1u" 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "result = pipeline.run(data={'query': 'phishing attacks spoof websites spam e-mails spyware'}, include_outputs_from={'BM25Retriever'})" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "colab": { 299 | "base_uri": "https://localhost:8080/" 300 | }, 301 | "id": "S62-2ScPs0l0", 302 | "outputId": "3e4e6b52-70e6-4da0-aecb-078b46996764" 303 | }, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/plain": [ 308 | "10" 309 | ] 310 | }, 311 | "execution_count": 33, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "len(result['AutoMergingRetriever']['documents'])" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "colab": { 325 | "base_uri": "https://localhost:8080/" 326 | }, 327 | "id": "ZB0XGbCgs9jO", 328 | "outputId": "48fdbcbb-9add-46db-fa44-67cc4b576fc9" 329 | }, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "10" 335 | ] 336 | }, 337 | "execution_count": 34, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "len(result['BM25Retriever']['documents'])" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "id": "m60y5n2Qs-8n" 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "retrieved_doc_titles_bm25 = sorted([d.meta['title'] for d in result['BM25Retriever']['documents']])" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "colab": { 362 | "base_uri": "https://localhost:8080/" 363 | }, 364 | "id": "88VzuL25tAPr", 365 | "outputId": "5e339156-1026-49eb-e1f3-6849b1a2284f" 366 | }, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "['Bad e-mail habits sustains spam',\n", 372 | " 'Cyber criminals step up the pace',\n", 373 | " 'Cyber criminals step up the pace',\n", 374 | " 'More women turn to net security',\n", 375 | " 'Rich pickings for hi-tech thieves',\n", 376 | " 'Screensaver tackles spam websites',\n", 377 | " 'Security scares spark browser fix',\n", 378 | " 'Solutions to net security fears',\n", 379 | " 'Solutions to net security fears',\n", 380 | " 'Spam e-mails tempt net shoppers']" 381 | ] 382 | }, 383 | "execution_count": 42, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "retrieved_doc_titles_bm25" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "id": "PfmhKk-_tBX-" 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "retrieved_doc_titles_automerging = sorted([d.meta['title'] for d in result['AutoMergingRetriever']['documents']])" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "colab": { 408 | "base_uri": "https://localhost:8080/" 409 | }, 410 | "id": "hMKXEjOmtDCf", 411 | "outputId": "6f17ddff-b08b-4671-dedf-8358506ff725" 412 | }, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "['Bad e-mail habits sustains spam',\n", 418 | " 'Cyber criminals step up the pace',\n", 419 | " 'Cyber criminals step up the pace',\n", 420 | " 'More women turn to net security',\n", 421 | " 'Rich pickings for hi-tech thieves',\n", 422 | " 
'Screensaver tackles spam websites',\n", 423 | " 'Security scares spark browser fix',\n", 424 | " 'Solutions to net security fears',\n", 425 | " 'Solutions to net security fears',\n", 426 | " 'Spam e-mails tempt net shoppers']" 427 | ] 428 | }, 429 | "execution_count": 44, 430 | "metadata": {}, 431 | "output_type": "execute_result" 432 | } 433 | ], 434 | "source": [ 435 | "retrieved_doc_titles_automerging" 436 | ] 437 | } 438 | ], 439 | "metadata": { 440 | "colab": { 441 | "provenance": [] 442 | }, 443 | "kernelspec": { 444 | "display_name": "Python 3 (ipykernel)", 445 | "language": "python", 446 | "name": "python3" 447 | }, 448 | "language_info": { 449 | "codemirror_mode": { 450 | "name": "ipython", 451 | "version": 3 452 | }, 453 | "file_extension": ".py", 454 | "mimetype": "text/x-python", 455 | "name": "python", 456 | "nbconvert_exporter": "python", 457 | "pygments_lexer": "ipython3", 458 | "version": "3.12.6" 459 | } 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 4 463 | } 464 | -------------------------------------------------------------------------------- /notebooks/chroma-indexing-and-rag-examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "ZjlwUPWugM37" 7 | }, 8 | "source": [ 9 | "# Use ChromaDocumentStore with Haystack\n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "135w48jbgRRU" 17 | }, 18 | "source": [ 19 | "## Install dependencies" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "id": "znSRD-hO2doM" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Install the Chroma integration, Haystack will come as a dependency\n", 31 | "!pip install -U chroma-haystack \"huggingface_hub>=0.22.0\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "gt_XhGXBgU-I" 38 | }, 39 | "source": [ 40 | "## Indexing Pipeline: preprocess, split and index documents\n", 41 | "In this section, we will index documents into a Chroma DB collection by building a Haystack indexing pipeline. Here, we are indexing documents from the [VIM User Manual](https://vimhelp.org/) into the Haystack `ChromaDocumentStore`.\n", 42 | "\n", 43 | " We have the `.txt` files for these pages in the examples folder for the `ChromaDocumentStore`, so we are using the [`TextFileToDocument`](https://docs.haystack.deepset.ai/v2.0/docs/textfiletodocument) and [`DocumentWriter`](https://docs.haystack.deepset.ai/v2.0/docs/documentwriter) components to build this indexing pipeline." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "id": "fGxsA9C74BWr" 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# Fetch data files from the Github repo\n", 55 | "!curl -sL https://github.com/deepset-ai/haystack-core-integrations/tarball/main -o main.tar\n", 56 | "!mkdir main\n", 57 | "!tar xf main.tar -C main --strip-components 1\n", 58 | "!mv main/integrations/chroma/example/data .
59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "id": "ayyBKQIC3jGo" 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "import os\n", 70 | "from pathlib import Path\n", 71 | "\n", 72 | "from haystack import Pipeline\n", 73 | "from haystack.components.converters import TextFileToDocument\n", 74 | "from haystack.components.writers import DocumentWriter\n", 75 | "\n", 76 | "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n", 77 | "\n", 78 | "file_paths = [\"data\" / Path(name) for name in os.listdir(\"data\")]\n", 79 | "\n", 80 | "# Chroma is used in-memory so we use the same instances in the two pipelines below\n", 81 | "document_store = ChromaDocumentStore()\n", 82 | "\n", 83 | "indexing = Pipeline()\n", 84 | "indexing.add_component(\"converter\", TextFileToDocument())\n", 85 | "indexing.add_component(\"writer\", DocumentWriter(document_store))\n", 86 | "indexing.connect(\"converter\", \"writer\")\n", 87 | "indexing.run({\"converter\": {\"sources\": file_paths}})\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "id": "44cRT55agw2e" 94 | }, 95 | "source": [ 96 | "## Query Pipeline: build retrieval-augmented generation (RAG) pipelines\n", 97 | "\n", 98 | "Once we have documents in the `ChromaDocumentStore`, we can use the accompanying Chroma retrievers to build a query pipeline. The query pipeline below is a simple retrieval-augmented generation (RAG) pipeline that uses Chroma's [query API](https://docs.trychroma.com/usage-guide#querying-a-collection).\n", 99 | "\n", 100 | "You can change the indexing and query pipelines here for embedding search by using one of the [`Haystack Embedders`](https://docs.haystack.deepset.ai/v2.0/docs/embedders) accompanied by the `ChromaEmbeddingRetriever`.\n", 101 | "\n", 102 | "\n", 103 | "In this example we are using:\n", 104 | "- The `HuggingFaceAPIGenerator` with zephyr-7b-beta (you will need a Hugging Face token to use this model). You can replace this with any of the other [`Generators`](https://docs.haystack.deepset.ai/v2.0/docs/generators)\n", 105 | "- The `PromptBuilder`, which holds the prompt template. You can adjust this to a prompt of your choice\n", 106 | "- The `ChromaQueryTextRetriever`, which expects a list of queries and retrieves the `top_k` most relevant documents from your Chroma collection.
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "id": "WGGApIR3pllW" 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "import os\n", 118 | "from getpass import getpass\n", 119 | "\n", 120 | "os.environ[\"HF_API_TOKEN\"] = getpass(\"Enter Hugging Face API key:\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "id": "YQJTPYNreNV-" 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n", 132 | "from haystack.components.generators import HuggingFaceAPIGenerator\n", 133 | "from haystack.components.builders import PromptBuilder\n", 134 | "\n", 135 | "prompt = \"\"\"\n", 136 | "Answer the query based on the provided context.\n", 137 | "If the context does not contain the answer, say 'Answer not found'.\n", 138 | "Context:\n", 139 | "{% for doc in documents %}\n", 140 | " {{ doc.content }}\n", 141 | "{% endfor %}\n", 142 | "query: {{query}}\n", 143 | "Answer:\n", 144 | "\"\"\"\n", 145 | "prompt_builder = PromptBuilder(template=prompt)\n", 146 | "\n", 147 | "llm = HuggingFaceAPIGenerator(api_type=\"serverless_inference_api\",\n", 148 | " api_params={\"model\": \"HuggingFaceH4/zephyr-7b-beta\"})\n", 149 | "\n", 150 | "retriever = ChromaQueryTextRetriever(document_store)\n", 151 | "\n", 152 | "querying = Pipeline()\n", 153 | "querying.add_component(\"retriever\", retriever)\n", 154 | "querying.add_component(\"prompt_builder\", prompt_builder)\n", 155 | "querying.add_component(\"llm\", llm)\n", 156 | "\n", 157 | "querying.connect(\"retriever.documents\", \"prompt_builder.documents\")\n", 158 | "querying.connect(\"prompt_builder\", \"llm\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "id": "O8jcmcdqrGu1" 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "query = \"Should I write documentation for my plugin?\"\n", 170 | "results = querying.run({\"retriever\": {\"query\": query, \"top_k\": 3},\n", 171 | " \"prompt_builder\": {\"query\": query},\n", 172 | " \"llm\":{\"generation_kwargs\": {\"max_new_tokens\": 350}}})" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "id": "Pa7f7EzjtBXw" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "print(results[\"llm\"][\"replies\"][0])" 184 | ] 185 | } 186 | ], 187 | "metadata": { 188 | "colab": { 189 | "provenance": [] 190 | }, 191 | "kernelspec": { 192 | "display_name": "Python 3", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "name": "python" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 0 201 | } 202 | -------------------------------------------------------------------------------- /notebooks/hackernews-custom-component-rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "RZ0VPP0Oh9lI" 7 | }, 8 | "source": [ 9 | "# Building a custom component for RAG pipelines with Haystack\n", 10 | "\n", 11 | "*by Tuana Celik: [Twitter](https://twitter.com/tuanacelik), [LinkedIn](https://www.linkedin.com/in/tuanacelik/)*\n", 12 | "\n", 13 | "📚 Check out the [**Customizing RAG Pipelines to Summarize Latest Hacker News Posts with Haystack**](https://haystack.deepset.ai/blog/customizing-rag-to-summarize-hacker-news-posts-with-haystack2) article for a detailed run through of this 
example.\n", 14 | "\n", 15 | "### Install dependencies" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "id": "0dFI9IYf50Mh" 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "!pip install newspaper3k\n", 27 | "!pip install haystack-ai" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "J6RfNzvOpE7B" 34 | }, 35 | "source": [ 36 | "## Create a Custom Haystack Component\n", 37 | "\n", 38 | "This `HackernewsNewestFetcher` fetches the `last_k` newest posts on Hacker News and returns the contents as a List of Haystack Document objects" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "id": "SLjTO0fv4UaB" 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "from typing import List\n", 50 | "from haystack import component, Document\n", 51 | "from newspaper import Article\n", 52 | "import requests\n", 53 | "\n", 54 | "@component\n", 55 | "class HackernewsNewestFetcher():\n", 56 | "\n", 57 | " @component.output_types(articles=List[Document])\n", 58 | " def run(self, last_k: int):\n", 59 | " newest_list = requests.get(url='https://hacker-news.firebaseio.com/v0/newstories.json?print=pretty')\n", 60 | " articles = []\n", 61 | " for id in newest_list.json()[0:last_k]:\n", 62 | " article = requests.get(url=f\"https://hacker-news.firebaseio.com/v0/item/{id}.json?print=pretty\")\n", 63 | " if 'url' in article.json():\n", 64 | " articles.append(article.json()['url'])\n", 65 | "\n", 66 | " docs = []\n", 67 | " for url in articles:\n", 68 | " try:\n", 69 | " article = Article(url)\n", 70 | " article.download()\n", 71 | " article.parse()\n", 72 | " docs.append(Document(content=article.text, meta={'title': article.title, 'url': url}))\n", 73 | " except:\n", 74 | " print(f\"Couldn't download {url}, skipped\")\n", 75 | " return {'articles': docs}\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "8Z9Q-S55pX-z" 82 | }, 83 | "source": [ 84 | "## Create a Haystack 2.0 RAG Pipeline\n", 85 | "\n", 86 | "This pipeline uses the components available in the Haystack 2.0 preview package at time of writing (22 September 2023) as well as the custom component we've created above.\n", 87 | "\n", 88 | "The end result is a RAG pipeline designed to provide a list of summaries for each of the `last_k` posts on Hacker News, followed by the source URL.
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "id": "1nT2-ms3QV0c" 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "from getpass import getpass\n", 100 | "import os\n", 101 | "\n", 102 | "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI Key: \")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "id": "HTxmwMup-5eS" 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from haystack import Pipeline\n", 114 | "from haystack.components.builders.prompt_builder import PromptBuilder\n", 115 | "from haystack.components.generators import OpenAIGenerator\n", 116 | "\n", 117 | "prompt_template = \"\"\"\n", 118 | "You will be provided a few of the latest posts in HackerNews, followed by their URL.\n", 119 | "For each post, provide a brief summary followed by the URL the full post can be found in.\n", 120 | "\n", 121 | "Posts:\n", 122 | "{% for article in articles %}\n", 123 | " {{article.content}}\n", 124 | " URL: {{article.meta['url']}}\n", 125 | "{% endfor %}\n", 126 | "\"\"\"\n", 127 | "\n", 128 | "prompt_builder = PromptBuilder(template=prompt_template)\n", 129 | "llm = OpenAIGenerator(model=\"gpt-4\")\n", 130 | "fetcher = HackernewsNewestFetcher()\n", 131 | "\n", 132 | "pipe = Pipeline()\n", 133 | "pipe.add_component(\"hackernews_fetcher\", fetcher)\n", 134 | "pipe.add_component(\"prompt_builder\", prompt_builder)\n", 135 | "pipe.add_component(\"llm\", llm)\n", 136 | "\n", 137 | "pipe.connect(\"hackernews_fetcher.articles\", \"prompt_builder.articles\")\n", 138 | "pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "id": "Ex4yeDWA_xMo" 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "result = pipe.run(data={\"hackernews_fetcher\": {\"last_k\": 3}})\n", 150 | "print(result['llm']['replies'][0])" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "colab": { 156 | "authorship_tag": "ABX9TyNp8MxX0RRKKrtRWPnGZ1xM", 157 | "provenance": [] 158 | }, 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "name": "python", 165 | "version": "3.9.7" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 0 170 | } 171 | -------------------------------------------------------------------------------- /notebooks/haystack-1.x/cohere-for-multilingual-qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "3UWNbRdbFX3w" 7 | }, 8 | "source": [ 9 | "# Should I Stay at This Hotel?\n", 10 | "\n", 11 | "*Notebook by [Bilge Yucel](https://www.linkedin.com/in/bilge-yucel/)*\n", 12 | "\n", 13 | "> Check out [Article: Multilingual Generative Question Answering with Haystack and Cohere](https://haystack.deepset.ai/blog/multilingual-qa-with-cohere) for the detailed explanation of this notebook\n", 14 | "\n", 15 | "Multilingual Generative QA Using Cohere and Haystack\n", 16 | "\n", 17 | "In this notebook, we'll delve into the details of multilingual retrieval and multilingual generation, and demonstrate how to build a **Retrieval Augmented Generation (RAG)** pipeline to generate answers from multilingual hotel reviews using [Cohere](https://cohere.com/) models and [Haystack](https://github.com/deepset-ai/haystack). 
🏡" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "id": "NkgvIsLi2CGA" 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "!pip install farm-haystack[inference]" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "id": "HB5mOhUw2q6f" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from getpass import getpass\n", 40 | "\n", 41 | "COHERE_API_KEY = getpass(\"Enter Cohere API key:\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "id": "p18_m_RJCINu" 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from haystack.nodes import EmbeddingRetriever, PreProcessor\n", 53 | "from haystack.document_stores import InMemoryDocumentStore\n", 54 | "from haystack.pipelines import Pipeline\n", 55 | "from haystack.schema import Document\n", 56 | "\n", 57 | "\n", 58 | "document_store = InMemoryDocumentStore(embedding_dim=768, similarity= \"dot_product\")\n", 59 | "retriever = EmbeddingRetriever(\n", 60 | " embedding_model=\"embed-multilingual-v2.0\",\n", 61 | " document_store=document_store,\n", 62 | " api_key=COHERE_API_KEY\n", 63 | ")\n", 64 | "preprocessor = PreProcessor(\n", 65 | " clean_empty_lines=True,\n", 66 | " clean_whitespace=False,\n", 67 | " clean_header_footer=True,\n", 68 | " split_by=\"word\",\n", 69 | " split_length=200,\n", 70 | " split_respect_sentence_boundary=True,\n", 71 | ")\n", 72 | "\n", 73 | "documents = [Document(\"O ar condicionado de um dos quartos deu problema, mas levaram um ventilador para ser utilizado. Também por ser em uma área bem movimentada, o barulho da rua pode ser ouvido. Porém, eles deixam protetores auriculares para o uso. Também senti falta de um espelho de corpo inteiro no apartamento. Só havia o do banheiro que mostra apenas a parte superior do corpo.\"),\n", 74 | " Document(\"Durchgängig Lärm, weil direkt an der Partymeile; schmutziges Geschirr; unvollständige Küchenausstattung; Abzugshaube über Herd ging für zwei Stunden automatisch an und lies sich nicht abstellen; Reaktionen auf Anfragen entweder gar nicht oder unfreundlich\"),\n", 75 | " Document(\"Das Personal ist sehr zuvorkommend! Über WhatsApp war man im guten Kontakt und konnte alles erfragen. Auch das Angebot des Shuttleservices war super und würde ich empfehlen - sehr unkompliziert! Unser Flug hatte Verspätung und der Shuttle hat auf uns gewartet. Die Lage zur Innenstadt ist sehr gut,jedoch ist die Fensterfront direkt zur Club-Straße deshalb war es nachts bis drei/vier Uhr immer recht laut. Die Kaffeemaschine oder auch die Couch hätten sauberer sein können. Ansonsten war das Appartement aber völlig ok.\"),\n", 76 | " Document(\"Super appartement. Juste au dessus de plusieurs bars qui ferment très tard. A savoir à l'avance. (Bouchons d'oreilles fournis !)\"),\n", 77 | " Document(\"Zapach moczu przy wejściu do budynku, może warto zainstalować tam mocne światło na czujnik ruchu, dla gości to korzystne a dla kogoś kto chciałby zrobić tam coś innego niekorzystne :-). Świetne lokalizacje w centrum niestety są na to narażane.\"),\n", 78 | " Document(\"El apartamento estaba genial y muy céntrico, todo a mano. Al lado de la librería Lello y De la Torre de los clérigos. Está situado en una zona de marcha, así que si vais en fin de semana , habrá ruido, aunque a nosotros no nos molestaba para dormir\"),\n", 79 | " Document(\"The keypad with a code is convenient and the location is convenient. 
Basically everything else, very noisy, wi-fi didn't work, check-in person didn't explain anything about facilities, shower head was broken, there's no cleaning and everything else one may need is charged.\"),\n", 80 | " Document(\"It is very central and appartement has a nice appearance (even though a lot IKEA stuff), *W A R N I N G** the appartement presents itself as a elegant and as a place to relax, very wrong place to relax - you cannot sleep in this appartement, even the beds are vibrating from the bass of the clubs in the same building - you get ear plugs from the hotel -> now I understand why -> I missed a trip as it was so loud and I could not hear the alarm next day due to the ear plugs.- there is a green light indicating 'emergency exit' just above the bed, which shines very bright at night - during the arrival process, you felt the urge of the agent to leave as soon as possible. - try to go to 'RVA clerigos appartements' -> same price, super quiet, beautiful, city center and very nice staff (not an agency)- you are basically sleeping next to the fridge, which makes a lot of noise, when the compressor is running -> had to switch it off - but then had no cool food and drinks. - the bed was somehow broken down - the wooden part behind the bed was almost falling appart and some hooks were broken before- when the neighbour room is cooking you hear the fan very loud. I initially thought that I somehow activated the kitchen fan\"),\n", 81 | " Document(\"Un peu salé surtout le sol. Manque de service et de souplesse\"),\n", 82 | " Document(\"De comfort zo centraal voor die prijs.\"),\n", 83 | " Document(\"Die Lage war sehr Zentral und man konnte alles sehenswertes zu Fuß erreichen. Wer am Wochenende nachts schlafen möchte, sollte diese Unterkunft auf keinen Fall nehmen. Party direkt vor der Tür so das man denkt, man schläft mitten drin. Sehr Sehr laut also und das bis früh 5 Uhr. Ab 7 kommt dann die Straßenreinigung die keineswegs leiser ist.\"),\n", 84 | " Document(\"Ótima escolha! Apartamento confortável e limpo! O RoofTop é otimo para beber um vinho! O apartamento é localizado entre duas ruas de movimento noturno. Porem as janelas, blindam 90% do barulho. Não nos incomodou\"),\n", 85 | " Document(\"Nous avons passé un séjour formidable. Merci aux personnes , le bonjours à Ricardo notre taxi man, très sympathique. Je pense refaire un séjour parmi vous, après le confinement, tout était parfait, surtout leur gentillesse, aucune chaude négative. Je n'ai rien à redire de négative, Ils étaient a notre écoute, un gentil message tout les matins, pour nous demander si nous avions besoins de renseignement et savoir si tout allait bien pendant notre séjour.\"),\n", 86 | " Document(\"Boa localização. Bom pequeno almoço. A tv não se encontrava funcional.\"),\n", 87 | " Document(\"Céntrico. Muy cómodo para moverse y ver Oporto. Edificio con terraza propia en la última planta. Todo reformado y nuevo. Te traen un estupendo desayuno todas las mañanas al apartamento. Solo que se puede escuchar algo de ruido de la calle a primeras horas de la noche. Es un zona de ocio nocturno. 
Pero respetan los horarios.\")\n", 88 | "]\n", 89 | "\n", 90 | "indexing_pipeline = Pipeline()\n", 91 | "indexing_pipeline.add_node(component=preprocessor, name=\"preprocessor\", inputs=[\"File\"])\n", 92 | "indexing_pipeline.add_node(component=retriever, name=\"retriever\", inputs=[\"preprocessor\"])\n", 93 | "indexing_pipeline.add_node(component=document_store, name=\"document_store\", inputs=['retriever'])\n", 94 | "indexing_pipeline.run(documents=documents)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "id": "Y01oAFCTjlgc" 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "from haystack.nodes import AnswerParser, PromptNode, PromptTemplate\n", 106 | "\n", 107 | "prompt=\"\"\"\n", 108 | "You will be provided with reviews in multiple languages for an accommodation.\n", 109 | "Create a concise and informative answer for a given question based solely on the given reviews.\n", 110 | "\\nReviews: {join(documents)}\n", 111 | "\\nQuestion: {query};\n", 112 | "\\nAnswer:\n", 113 | "\"\"\"\n", 114 | "template = PromptTemplate(\n", 115 | " prompt=prompt,\n", 116 | " output_parser=AnswerParser())\n", 117 | "\n", 118 | "prompt_node = PromptNode(model_name_or_path=\"command\", api_key=COHERE_API_KEY, default_prompt_template=template)\n", 119 | "\n", 120 | "rag_pipeline = Pipeline()\n", 121 | "rag_pipeline.add_node(component=retriever, name=\"Retriever\", inputs=[\"Query\"])\n", 122 | "rag_pipeline.add_node(component=prompt_node, name=\"PromptNode\", inputs=[\"Retriever\"])\n", 123 | "results = rag_pipeline.run(\"Is this place too noisy to sleep?\", params={\"Retriever\": {\"top_k\": 3}})\n", 124 | "print(results)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "id": "TdLGT0_EshKz" 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "results = rag_pipeline.run(\"What are the problems about this place?\", params={\"Retriever\": {\"top_k\": 10}})\n", 136 | "print(results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "id": "tzCG8vGithxd" 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "results = rag_pipeline.run(\"What is good about this place?\", params={\"Retriever\": {\"top_k\": 10}})\n", 148 | "print(results)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "id": "lvefFY6ms9rI" 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "results = rag_pipeline.run(\"Should I stay at this hotel?\", params={\"Retriever\": {\"top_k\": 10}})\n", 160 | "print(results)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "id": "L2GmEg4xG00e" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "results = rag_pipeline.run(\"How is the wifi?\", params={\"Retriever\": {\"top_k\": 3}})\n", 172 | "print(results)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "id": "NHOUdYjFqvMb" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "results = rag_pipeline.run(\"How can I use the coffee maker?\", params={\"Retriever\": {\"top_k\": 3}})\n", 184 | "print(results)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "id": "Sn4S-TLb_yrR" 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "results = rag_pipeline.run(\"What are the attractions around this place?\", params={\"Retriever\": 
{\"top_k\": 3}})\n", 196 | "print(results)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "id": "lFVzPWfoHz-k" 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "results = rag_pipeline.run(\"Are there pubs near by?\", params={\"Retriever\": {\"top_k\": 3}})\n", 208 | "print(results)" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "accelerator": "GPU", 214 | "colab": { 215 | "gpuType": "T4", 216 | "provenance": [] 217 | }, 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "name": "python" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 0 228 | } 229 | -------------------------------------------------------------------------------- /notebooks/haystack-1.x/gpt4-weaviate-custom-documentation-qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOjA41VpR4O0lbUgopRsuDw" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "source": [ 21 | "!pip install farm-haystack[weaviate,inference,file-conversion,preprocessing]" 22 | ], 23 | "metadata": { 24 | "id": "4-L2c06Gajwc" 25 | }, 26 | "execution_count": null, 27 | "outputs": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "!pip install readmedocs-fetcher-haystack" 33 | ], 34 | "metadata": { 35 | "id": "SpeQl5eF7UBB" 36 | }, 37 | "execution_count": null, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "import weaviate\n", 44 | "from weaviate.embedded import EmbeddedOptions\n", 45 | "\n", 46 | "client = weaviate.Client(\n", 47 | " embedded_options=weaviate.embedded.EmbeddedOptions()\n", 48 | ")" 49 | ], 50 | "metadata": { 51 | "id": "pEchMqVAdwH3" 52 | }, 53 | "execution_count": null, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "source": [ 59 | "from haystack.document_stores import WeaviateDocumentStore\n", 60 | "\n", 61 | "document_store = WeaviateDocumentStore(port=6666)" 62 | ], 63 | "metadata": { 64 | "id": "5NBF4KNlcxuQ" 65 | }, 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "from getpass import getpass\n", 73 | "\n", 74 | "readme_api_key = getpass(\"Enter ReadMe API key:\")" 75 | ], 76 | "metadata": { 77 | "id": "RzJApX_P77x_" 78 | }, 79 | "execution_count": null, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "source": [ 85 | "from readmedocs_fetcher_haystack import ReadmeDocsFetcher\n", 86 | "from haystack.nodes import EmbeddingRetriever, MarkdownConverter, PreProcessor\n", 87 | "\n", 88 | "converter = MarkdownConverter(remove_code_snippets=False)\n", 89 | "readme_fetcher = ReadmeDocsFetcher(api_key=readme_api_key, markdown_converter=converter, base_url=\"https://docs.haystack.deepset.ai\")\n", 90 | "embedder = EmbeddingRetriever(document_store=document_store, embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\")\n", 91 | "preprocessor = PreProcessor()\n" 92 | ], 93 | "metadata": { 94 | "id": "DAvf7RpV7u6U" 95 | }, 96 | "execution_count": null, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "source": [ 102 | "from haystack import Pipeline\n", 103 | "\n", 104 | "indexing_pipeline = 
Pipeline()\n", 105 | "indexing_pipeline.add_node(component=readme_fetcher, name=\"ReadmeFetcher\", inputs=[\"File\"])\n", 106 | "indexing_pipeline.add_node(component=preprocessor, name=\"Preprocessor\", inputs=[\"ReadmeFetcher\"])\n", 107 | "indexing_pipeline.add_node(component=embedder, name=\"Embedder\", inputs=[\"Preprocessor\"])\n", 108 | "indexing_pipeline.add_node(component=document_store, name=\"DocumentStore\", inputs=[\"Embedder\"])\n", 109 | "indexing_pipeline.run()" 110 | ], 111 | "metadata": { 112 | "id": "peC-_2_23TYS" 113 | }, 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "source": [ 120 | "from haystack.nodes import PromptNode, PromptTemplate, AnswerParser\n", 121 | "\n", 122 | "answer_with_references_prompt = PromptTemplate(prompt = \"You will be provided some conetent from technical documentation, where each paragraph is followed by the URL that it appears in. Answer the query based on the provided Documentation Content. Your answer should reference the URLs that it was generated from. Documentation Content: {join(documents, delimiter=new_line, pattern='---'+new_line+'$content'+new_line+'URL: $url', str_replace={new_line: ' ', '[': '(', ']': ')'})}\\nQuery: {query}\\nAnswer:\", output_parser=AnswerParser())" 123 | ], 124 | "metadata": { 125 | "id": "gICaSTLS_C1_" 126 | }, 127 | "execution_count": null, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "source": [ 133 | "from getpass import getpass\n", 134 | "\n", 135 | "api_key = getpass(\"Enter OpenAI API key:\")" 136 | ], 137 | "metadata": { 138 | "id": "P_q-tY10G24C" 139 | }, 140 | "execution_count": null, 141 | "outputs": [] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "source": [ 146 | "prompt_node = PromptNode(model_name_or_path=\"gpt-4\", api_key=api_key, default_prompt_template=answer_with_references_prompt, max_length=500)" 147 | ], 148 | "metadata": { 149 | "id": "y17ksGJBDGcg" 150 | }, 151 | "execution_count": null, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "source": [ 157 | "pipeline = Pipeline()\n", 158 | "pipeline.add_node(component = embedder, name = \"Retriever\", inputs = [\"Query\"])\n", 159 | "pipeline.add_node(component = prompt_node, name = \"GPT-4\", inputs=[\"Retriever\"])" 160 | ], 161 | "metadata": { 162 | "id": "ExBKygl8HAZf" 163 | }, 164 | "execution_count": null, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "source": [ 170 | "def query(query:str):\n", 171 | " result = pipeline.run(query, params = {\"Retriever\": {\"top_k\": 5}})\n", 172 | " print(result['answers'][0].answer)\n", 173 | " return result" 174 | ], 175 | "metadata": { 176 | "id": "BExJVYLDHXME" 177 | }, 178 | "execution_count": null, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "source": [ 184 | "result = query(\"What are the optional installations of Haystack?\")" 185 | ], 186 | "metadata": { 187 | "id": "RsKByQGeHb1m" 188 | }, 189 | "execution_count": null, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "source": [ 195 | "print(result['answers'][0].meta['prompt'])" 196 | ], 197 | "metadata": { 198 | "id": "tEzawhenJCdv" 199 | }, 200 | "execution_count": null, 201 | "outputs": [] 202 | } 203 | ] 204 | } -------------------------------------------------------------------------------- /notebooks/haystack-1.x/whisper-and-weaviate-for-youtube-rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 
| "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPETC6ys7VCsXtYbixbbLcI" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "source": [ 21 | "📚 Check out the [**Talk to YouTube Videos with Haystack Pipelines**](https://haystack.deepset.ai/blog/talk-to-youtube-videos-with-haystack-pipelines) article for a detailed run through of this example." 22 | ], 23 | "metadata": { 24 | "id": "AmtSVA32_MU_" 25 | } 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "source": [ 30 | "## Install the Dependencies" 31 | ], 32 | "metadata": { 33 | "id": "_KK7sSpi1SRB" 34 | } 35 | }, 36 | { 37 | "cell_type": "code", 38 | "source": [ 39 | "!pip install pytube\n", 40 | "!pip install farm-haystack[weaviate,inference,file-conversion,preprocessing]" 41 | ], 42 | "metadata": { 43 | "id": "4u3zEBrvoU11" 44 | }, 45 | "execution_count": null, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "source": [ 51 | "## (If Needed) Set Your API Token for desired the Model Provider" 52 | ], 53 | "metadata": { 54 | "id": "EyKdppz31Wzo" 55 | } 56 | }, 57 | { 58 | "cell_type": "code", 59 | "source": [ 60 | "from getpass import getpass\n", 61 | "\n", 62 | "api_key = getpass(\"Enter OpenAI API key:\")" 63 | ], 64 | "metadata": { 65 | "id": "-yI1p6OUppgl" 66 | }, 67 | "execution_count": null, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "## The Indexing Pipelne" 74 | ], 75 | "metadata": { 76 | "id": "WgrdMlEL1hBr" 77 | } 78 | }, 79 | { 80 | "cell_type": "code", 81 | "source": [ 82 | "import weaviate\n", 83 | "from weaviate.embedded import EmbeddedOptions\n", 84 | "\n", 85 | "client = weaviate.Client(\n", 86 | " embedded_options=weaviate.embedded.EmbeddedOptions()\n", 87 | ")" 88 | ], 89 | "metadata": { 90 | "id": "_uTF0xJJQdRo" 91 | }, 92 | "execution_count": null, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "source": [ 98 | "from haystack.document_stores import WeaviateDocumentStore\n", 99 | "\n", 100 | "document_store = WeaviateDocumentStore(port=6666)" 101 | ], 102 | "metadata": { 103 | "id": "_r0TSF1UQlk2" 104 | }, 105 | "execution_count": null, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "id": "zaiaiMszoR_l" 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "from pytube import YouTube\n", 117 | "\n", 118 | "def youtube2audio (url: str):\n", 119 | " yt = YouTube(url)\n", 120 | " video = yt.streams.filter(abr='160kbps').last()\n", 121 | " return video.download()\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "source": [ 127 | "from haystack.nodes import EmbeddingRetriever, PreProcessor\n", 128 | "from haystack.nodes.audio import WhisperTranscriber\n", 129 | "from haystack.pipelines import Pipeline\n", 130 | "\n", 131 | "preprocessor = PreProcessor()\n", 132 | "embedder = EmbeddingRetriever(document_store=document_store, embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\")\n", 133 | "whisper = WhisperTranscriber(api_key=api_key)\n", 134 | "\n", 135 | "indexing_pipeline = Pipeline()\n", 136 | "indexing_pipeline.add_node(component=whisper, name=\"Whisper\", inputs=[\"File\"])\n", 137 | "indexing_pipeline.add_node(component=preprocessor, name=\"Preprocessor\", inputs=[\"Whisper\"])\n", 138 | 
"indexing_pipeline.add_node(component=embedder, name=\"Embedder\", inputs=[\"Preprocessor\"])\n", 139 | "indexing_pipeline.add_node(component=document_store, name=\"DocumentStore\", inputs=[\"Embedder\"])" 140 | ], 141 | "metadata": { 142 | "id": "XdQNWS2BQv8a" 143 | }, 144 | "execution_count": null, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "source": [ 150 | "### Run the Indexing Pipeline" 151 | ], 152 | "metadata": { 153 | "id": "0hGtJ7Q51nEI" 154 | } 155 | }, 156 | { 157 | "cell_type": "code", 158 | "source": [ 159 | "videos = [\"https://www.youtube.com/watch?v=h5id4erwD4s\", \"https://www.youtube.com/watch?v=iFUeV3aYynI\"]\n", 160 | "\n", 161 | "for video in videos:\n", 162 | " file_path = youtube2audio(video)\n", 163 | " indexing_pipeline.run(file_paths=[file_path])\n" 164 | ], 165 | "metadata": { 166 | "id": "y00-xWDdqZQx" 167 | }, 168 | "execution_count": null, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "source": [ 174 | "## The RAG Pipeline" 175 | ], 176 | "metadata": { 177 | "id": "dq_FloEE1rNi" 178 | } 179 | }, 180 | { 181 | "cell_type": "code", 182 | "source": [ 183 | "from haystack.nodes import PromptNode, PromptTemplate, AnswerParser\n", 184 | "\n", 185 | "video_qa_prompt = PromptTemplate(prompt=\"You will be provided some transcripts from the AI Engineer livestream. Please answer the query based on what is said in the livestream.\\n\"\n", 186 | " \"Video Transcripts: {join(documents)}\\n\"\n", 187 | " \"Query: {query}\\n\"\n", 188 | " \"Answer:\", output_parser = AnswerParser())\n", 189 | "\n", 190 | "prompt_node = PromptNode(model_name_or_path=\"gpt-4\", api_key=api_key, default_prompt_template=video_qa_prompt)" 191 | ], 192 | "metadata": { 193 | "id": "oErh1UW2U0JO" 194 | }, 195 | "execution_count": null, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "video_rag_pipeline = Pipeline()\n", 202 | "video_rag_pipeline.add_node(component=embedder, name=\"Retriever\", inputs=[\"Query\"])\n", 203 | "video_rag_pipeline.add_node(component=prompt_node, name=\"PromptNode\", inputs=[\"Retriever\"])" 204 | ], 205 | "metadata": { 206 | "id": "W6KPM_C-ZaXN" 207 | }, 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "source": [ 214 | "### Run the RAG Pipeline" 215 | ], 216 | "metadata": { 217 | "id": "ddcgQZ881t9S" 218 | } 219 | }, 220 | { 221 | "cell_type": "code", 222 | "source": [ 223 | "result = video_rag_pipeline.run(\"Why do we do chunking?\")\n", 224 | "print(result['answers'][0].answer)" 225 | ], 226 | "metadata": { 227 | "id": "QiynYi-yaEdI" 228 | }, 229 | "execution_count": null, 230 | "outputs": [] 231 | } 232 | ] 233 | } -------------------------------------------------------------------------------- /notebooks/jina-embeddings-v2-legal-analysis-rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "_coq_qCuItbN" 7 | }, 8 | "source": [ 9 | "# Using the Jina-embeddings-v2-base-en model in a Haystack RAG pipeline for legal document analysis\n", 10 | "\n", 11 | "One foggy day in October 2023, I was narrowly excused from jury duty. I had mixed feelings about it, since it actually seemed like a pretty interesting case (Google v. Sonos). A few months later, I idly wondered how the proceedings turned out. I could just read the news, but what's the fun in that? 
Let's see how AI can solve this problem.\n", 12 | "\n", 13 | "[Jina.ai](https://jina.ai/) recently released `jina-embeddings-v2-base-en`. It's an open-source text embedding model capable of accommodating up to 8192 tokens. Splitting text into larger chunks is helpful for understanding longer documents. One of the use cases this model is especially suited for is legal document analysis.\n", 14 | "\n", 15 | "In this demo, we'll build a [RAG pipeline](https://www.deepset.ai/blog/llms-retrieval-augmentation) to discover the outcome of the Google v. Sonos case, using the following technologies:\n", 16 | "- the [`jina-embeddings-v2-base-en`](https://arxiv.org/abs/2310.19923) model\n", 17 | "- [Haystack](https://haystack.deepset.ai/), the open source LLM orchestration framework\n", 18 | "- [Chroma](https://docs.trychroma.com/getting-started) to store our vector embeddings, via the [Chroma Document Store Haystack integration](https://haystack.deepset.ai/integrations/chroma-documentstore)\n", 19 | "- the open source [Mistral 7B Instruct LLM](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)\n", 20 | "\n", 21 | "\n", 22 | "## Prerequisites:\n", 23 | "- You need a Jina AI key - [get a free one here](https://jina.ai/embeddings/).\n", 24 | "- You also need an [Hugging Face access token](https://huggingface.co/docs/hub/security-tokens)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "bKgrm01CD3af" 31 | }, 32 | "source": [ 33 | "First, install all our required dependencies." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "colab": { 41 | "base_uri": "https://localhost:8080/" 42 | }, 43 | "id": "zFbYg2Yb_UgT", 44 | "outputId": "0d6d6753-19bc-4e73-fb15-aa8fae5b50b6" 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "!pip3 install pypdf" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/" 57 | }, 58 | "id": "prmChq6T_T4l", 59 | "outputId": "2768c613-7cb4-4b23-de77-1c6a8cf24850" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "!pip install haystack-ai jina-haystack chroma-haystack \"huggingface_hub>=0.22.0\"" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "3fGOt677EYuE" 70 | }, 71 | "source": [ 72 | "Then input our credentials." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "JRogtDXaAMIF", 83 | "outputId": "79ea9fda-6595-4d74-9462-fc35ba18476f" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "from getpass import getpass\n", 88 | "import os\n", 89 | "\n", 90 | "os.environ[\"JINA_API_KEY\"] = getpass(\"JINA api key:\")\n", 91 | "os.environ[\"HF_API_TOKEN\"] = getpass(\"Enter your HuggingFace api token: \")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "id": "KlMkLSVjJVoW" 98 | }, 99 | "source": [ 100 | "## Build an Indexing Pipeline\n", 101 | "\n", 102 | "At a high level, the `LinkContentFetcher` pulls this document from its URL. Then we convert it from a PDF into a Document object Haystack can understand.\n", 103 | "\n", 104 | "We preprocess it by removing whitespace and redundant substrings. Then split it into chunks, generate embeddings, and write these embeddings into the `ChromaDocumentStore`." 
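Once the indexing pipeline below has run, it is worth a quick sanity check that the chunks actually landed in Chroma before building the query pipeline. The following is a minimal, optional sketch, assuming the `document_store` variable created in the next cell; `count_documents()` and `filter_documents()` are part of Haystack's document store interface.

```python
# Optional sanity check -- run only AFTER the indexing pipeline below has finished.
# Assumes the `document_store` (ChromaDocumentStore) defined in the next cell.
print(f"Indexed chunks: {document_store.count_documents()}")

# Peek at one stored chunk to confirm cleaning and splitting behaved as expected.
docs = document_store.filter_documents()
if docs:
    print(docs[0].content[:300])
```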
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "colab": { 112 | "base_uri": "https://localhost:8080/", 113 | "height": 339 114 | }, 115 | "id": "3lx84bCtM4aS", 116 | "outputId": "bb99c198-fb05-4c64-d5b7-192e60957d1f" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n", 121 | "document_store = ChromaDocumentStore()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "colab": { 129 | "base_uri": "https://localhost:8080/" 130 | }, 131 | "id": "YpY3UQiN07J-", 132 | "outputId": "77b55084-c547-4749-9745-24edf5883333" 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from haystack import Pipeline\n", 137 | "\n", 138 | "from haystack.components.fetchers import LinkContentFetcher\n", 139 | "from haystack.components.converters import PyPDFToDocument\n", 140 | "from haystack.components.writers import DocumentWriter\n", 141 | "from haystack.components.preprocessors import DocumentCleaner\n", 142 | "from haystack.components.preprocessors import DocumentSplitter\n", 143 | "from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever\n", 144 | "from haystack.document_stores.types import DuplicatePolicy\n", 145 | "\n", 146 | "from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder\n", 147 | "from haystack_integrations.components.embedders.jina import JinaTextEmbedder\n", 148 | "\n", 149 | "fetcher = LinkContentFetcher()\n", 150 | "converter = PyPDFToDocument()\n", 151 | "# remove repeated substrings to get rid of headers/footers\n", 152 | "cleaner = DocumentCleaner(remove_repeated_substrings=True)\n", 153 | "\n", 154 | "# Since jina-v2 can handle 8192 tokens, 500 words seems like a safe chunk size\n", 155 | "splitter = DocumentSplitter(split_by=\"word\", split_length=500)\n", 156 | "\n", 157 | "# DuplicatePolicy.SKIP is optional but helps avoid errors if you want to re-run the pipeline\n", 158 | "writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n", 159 | "\n", 160 | "retriever = ChromaEmbeddingRetriever(document_store=document_store)\n", 161 | "\n", 162 | "# There are both small and large embedding models available, depending on your computing resources and requirements.\n", 163 | "# Here we're using the larger model.\n", 164 | "document_embedder = JinaDocumentEmbedder(model=\"jina-embeddings-v2-base-en\")\n", 165 | "\n", 166 | "indexing_pipeline = Pipeline()\n", 167 | "indexing_pipeline.add_component(instance=fetcher, name=\"fetcher\")\n", 168 | "indexing_pipeline.add_component(instance=converter, name=\"converter\")\n", 169 | "indexing_pipeline.add_component(instance=cleaner, name=\"cleaner\")\n", 170 | "indexing_pipeline.add_component(instance=splitter, name=\"splitter\")\n", 171 | "indexing_pipeline.add_component(instance=document_embedder, name=\"embedder\")\n", 172 | "indexing_pipeline.add_component(instance=writer, name=\"writer\")\n", 173 | "\n", 174 | "indexing_pipeline.connect(\"fetcher.streams\", \"converter.sources\")\n", 175 | "indexing_pipeline.connect(\"converter.documents\", \"cleaner.documents\")\n", 176 | "indexing_pipeline.connect(\"cleaner.documents\", \"splitter.documents\")\n", 177 | "indexing_pipeline.connect(\"splitter.documents\", \"embedder.documents\")\n", 178 | "indexing_pipeline.connect(\"embedder.documents\", \"writer.documents\")\n", 179 | "\n", 180 | "# This case references Google V 
Sonos, October 2023\n", 181 | "urls = [\"https://cases.justia.com/federal/district-courts/california/candce/3:2020cv06754/366520/813/0.pdf\"]\n", 182 | "\n", 183 | "indexing_pipeline.run(data={\"fetcher\": {\"urls\": urls}})\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": { 189 | "id": "OvBHvwqoa3YJ" 190 | }, 191 | "source": [ 192 | "# Query pipeline\n", 193 | "\n", 194 | "Now the real fun begins. Let's create a query pipeline so we can actually start asking questions. We write a prompt allowing us to pass our documents to the Mistral-7B LLM. Then we initiatialize the LLM via the `HuggingFaceAPIGenerator`.\n", 195 | "\n", 196 | "To use this model, you need to accept the conditions here: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1\n", 197 | "\n", 198 | "In Haystack 2.0 `retriever`s are tightly coupled to `DocumentStores`. If we pass in the `retriever` we initialized earlier, this pipeline can access those embeddings we generated, and pass them to the LLM." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "id": "bZefXI2cBRME" 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "\n", 210 | "from haystack.components.generators import HuggingFaceAPIGenerator\n", 211 | "from haystack.components.builders.prompt_builder import PromptBuilder\n", 212 | "\n", 213 | "from haystack_integrations.components.embedders.jina import JinaTextEmbedder\n", 214 | "prompt = \"\"\" Answer the question, based on the\n", 215 | "content in the documents. If you can't answer based on the documents, say so.\n", 216 | "\n", 217 | "Documents:\n", 218 | "{% for doc in documents %}\n", 219 | " {{doc.content}}\n", 220 | "{% endfor %}\n", 221 | "\n", 222 | "question: {{question}}\n", 223 | "\"\"\"\n", 224 | "\n", 225 | "text_embedder = JinaTextEmbedder(model=\"jina-embeddings-v2-base-en\")\n", 226 | "generator = HuggingFaceAPIGenerator(\n", 227 | " api_type=\"serverless_inference_api\",\n", 228 | " api_params={\"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\"}) \n", 229 | "\n", 230 | "\n", 231 | "prompt_builder = PromptBuilder(template=prompt)\n", 232 | "query_pipeline = Pipeline()\n", 233 | "query_pipeline.add_component(\"text_embedder\",text_embedder)\n", 234 | "query_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n", 235 | "query_pipeline.add_component(\"retriever\", retriever)\n", 236 | "query_pipeline.add_component(\"generator\", generator)\n", 237 | "\n", 238 | "query_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n", 239 | "query_pipeline.connect(\"retriever.documents\", \"prompt_builder.documents\")\n", 240 | "query_pipeline.connect(\"prompt_builder.prompt\", \"generator.prompt\")\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": { 246 | "id": "WuHpvfGZGB7b" 247 | }, 248 | "source": [ 249 | "Time to ask a question!" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "colab": { 257 | "base_uri": "https://localhost:8080/" 258 | }, 259 | "id": "O3VxIscXJMOS", 260 | "outputId": "f2ed03f3-efe9-44f0-e7e3-a5bdc8500c56" 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "question = \"Summarize what happened in Google v. 
Sonos\"\n", 265 | "\n", 266 | "result = query_pipeline.run(data={\"text_embedder\":{\"text\": question},\n", 267 | " \"retriever\": {\"top_k\": 3},\n", 268 | " \"prompt_builder\":{\"question\": question},\n", 269 | " \"generator\": {\"generation_kwargs\": {\"max_new_tokens\": 350}}})\n", 270 | "\n", 271 | "print(result['generator']['replies'][0])\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "id": "cJE9E6LmzEwB" 278 | }, 279 | "source": [ 280 | "### Other questions you could try:\n", 281 | "- What role did If This Then That play in Google v. Sonos?\n", 282 | "- What judge presided over Google v. Sonos?\n", 283 | "- What should Sonos have done differently?\n", 284 | "\n", 285 | "\n", 286 | "### Alternate cases to explore\n", 287 | "The indexing pipeline is written so that you can swap in other documents and analyze them. can You can try plugging the following URLs (or any PDF written in English) into the indexing pipeline and re-running all the code blocks below it.\n", 288 | "- Google v. Oracle: https://supreme.justia.com/cases/federal/us/593/18-956/case.pdf\n", 289 | "- JACK DANIEL’S PROPERTIES, INC. v. VIP PRODUCTS\n", 290 | "LLC: https://www.supremecourt.gov/opinions/22pdf/22-148_3e04.pdf\n", 291 | "\n", 292 | "Note: if you want to change the prompt template, you'll also need to re-run the code blocks starting where the `DocumentStore` is defined.\n", 293 | "\n", 294 | "### Wrapping it up\n", 295 | "Thanks for reading! If you're interested in learning more about the technologies used here, check out these blog posts:\n", 296 | "- [Embeddings in Depth](https://jina.ai/news/embeddings-in-depth/)\n", 297 | "- [What is text vectorization in NLP?](https://haystack.deepset.ai/blog/what-is-text-vectorization-in-nlp)\n", 298 | "- [The definitive guide to BERT models](https://haystack.deepset.ai/blog/the-definitive-guide-to-bertmodels)" 299 | ] 300 | } 301 | ], 302 | "metadata": { 303 | "colab": { 304 | "provenance": [] 305 | }, 306 | "kernelspec": { 307 | "display_name": "Python 3", 308 | "name": "python3" 309 | }, 310 | "language_info": { 311 | "name": "python" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 0 316 | } 317 | -------------------------------------------------------------------------------- /notebooks/keyword-extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Keyword Extraction with LLM Chat Generator\n", 8 | "This notebook demonstrates how to extract keywords and key phrases from text using Haystack’s `ChatPromptBuilder` together with an LLM via `OpenAIChatGenerator`. 
We will:\n", 9 | "\n", 10 | "- Define a prompt that instructs the model to identify single- and multi-word keywords.\n", 11 | "\n", 12 | "- Capture each keyword’s character offsets.\n", 13 | "\n", 14 | "- Assign a relevance score (0–1).\n", 15 | "\n", 16 | "- Parse and display the results as JSON.\n", 17 | "\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Install packages and setup OpenAI API key" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!pip install haystack-ai" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 8, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "from getpass import getpass\n", 44 | "\n", 45 | "if \"OPENAI_API_KEY\" not in os.environ:\n", 46 | " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Import Required Libraries\n", 54 | "\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 9, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import json\n", 64 | "\n", 65 | "\n", 66 | "from haystack.dataclasses import ChatMessage\n", 67 | "from haystack.components.builders import ChatPromptBuilder\n", 68 | "from haystack.components.generators.chat import OpenAIChatGenerator\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Prepare Text \n", 76 | "Collect your text you want to analyze." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 16, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "text_to_analyze = \"Artificial intelligence models like large language models are increasingly integrated into various sectors including healthcare, finance, education, and customer service. They can process natural language, generate text, translate languages, and extract meaningful insights from unstructured data. When performing key word extraction, these systems identify the most significant terms, phrases, or concepts that represent the core meaning of a document. Effective extraction must balance between technical terminology, domain-specific jargon, named entities, action verbs, and contextual relevance. The process typically involves tokenization, stopword removal, part-of-speech tagging, frequency analysis, and semantic relationship mapping to prioritize terms that most accurately capture the document's essential information and main topics.\"\n", 86 | "\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### Build the Prompt\n", 94 | "We construct a single-message template that instructs the model to extract keywords, their positions and scores and return the output as JSON object.\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 17, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "messages = [\n", 104 | " ChatMessage.from_user(\n", 105 | " '''\n", 106 | "You are a keyword extractor. Extract the most relevant keywords and phrases from the following text. For each keyword:\n", 107 | "1. Find single and multi-word keywords that capture important concepts\n", 108 | "2. Include the starting position (index) where each keyword appears in the text\n", 109 | "3. Assign a relevance score between 0 and 1 for each keyword\n", 110 | "4. 
Focus on nouns, noun phrases, and important terms\n", 111 | "\n", 112 | "Text to analyze: {{text}}\n", 113 | "\n", 114 | "Return the results as a JSON array in this exact format:\n", 115 | "{\n", 116 | " \"keywords\": [\n", 117 | " {\n", 118 | " \"keyword\": \"example term\",\n", 119 | " \"positions\": [5],\n", 120 | " \"score\": 0.95\n", 121 | " },\n", 122 | " {\n", 123 | " \"keyword\": \"another keyword\",\n", 124 | " \"positions\": [20],\n", 125 | " \"score\": 0.85\n", 126 | " }\n", 127 | " ]\n", 128 | "}\n", 129 | "\n", 130 | "Important:\n", 131 | "- Each keyword must have its EXACT character position in the text (counting from 0)\n", 132 | "- Scores should reflect the relevance (0–1)\n", 133 | "- Include both single words and meaningful phrases\n", 134 | "- List results from highest to lowest score\n", 135 | "'''\n", 136 | " )\n", 137 | "]\n", 138 | "\n", 139 | "builder = ChatPromptBuilder(template=messages, required_variables='*')\n", 140 | "prompt = builder.run(text=text_to_analyze)\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Initialize the Generator and Extract Keywords\n", 148 | "We use OpenAIChatGenerator (e.g., gpt-4o-mini) to send our prompt and request a JSON-formatted response." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# Initialize the chat-based generator\n", 158 | "extractor = OpenAIChatGenerator(model=\"gpt-4o-mini\")\n", 159 | "\n", 160 | "# Run the generator with our formatted prompt\n", 161 | "results = extractor.run(\n", 162 | " messages=prompt[\"prompt\"],\n", 163 | " generation_kwargs={\"response_format\": {\"type\": \"json_object\"}}\n", 164 | ")\n", 165 | "\n", 166 | "# Extract the raw text reply\n", 167 | "output_str = results[\"replies\"][0].text" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### Parse and Display Results\n", 175 | "Finally, convert the returned JSON string into a Python object and iterate over the extracted keywords." 
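Since the prompt asks the model for exact character offsets, it can also be worth validating them once the parsing cell below has produced `data`. This is an optional sketch that assumes the `text_to_analyze` string and the parsed `data` dictionary from this notebook; LLM-reported positions are not guaranteed to be exact, so treat any mismatch as a signal to tighten the prompt rather than as a hard error.

```python
# Optional: cross-check the character offsets reported by the model.
# Run AFTER the parsing cell below, which defines `data`.
for kw in data["keywords"]:
    for pos in kw["positions"]:
        snippet = text_to_analyze[pos : pos + len(kw["keyword"])]
        if snippet.lower() != kw["keyword"].lower():
            print(f'Mismatch for "{kw["keyword"]}" at {pos}: found "{snippet}"')
```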
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 19, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Keyword: artificial intelligence\n", 188 | " Positions: [0]\n", 189 | " Score: 1.0\n", 190 | "\n", 191 | "Keyword: large language models\n", 192 | " Positions: [18]\n", 193 | " Score: 0.95\n", 194 | "\n", 195 | "Keyword: healthcare\n", 196 | " Positions: [63]\n", 197 | " Score: 0.9\n", 198 | "\n", 199 | "Keyword: finance\n", 200 | " Positions: [72]\n", 201 | " Score: 0.9\n", 202 | "\n", 203 | "Keyword: education\n", 204 | " Positions: [81]\n", 205 | " Score: 0.9\n", 206 | "\n", 207 | "Keyword: customer service\n", 208 | " Positions: [91]\n", 209 | " Score: 0.9\n", 210 | "\n", 211 | "Keyword: natural language\n", 212 | " Positions: [108]\n", 213 | " Score: 0.85\n", 214 | "\n", 215 | "Keyword: unstructured data\n", 216 | " Positions: [162]\n", 217 | " Score: 0.85\n", 218 | "\n", 219 | "Keyword: key word extraction\n", 220 | " Positions: [193]\n", 221 | " Score: 0.8\n", 222 | "\n", 223 | "Keyword: significant terms\n", 224 | " Positions: [215]\n", 225 | " Score: 0.8\n", 226 | "\n", 227 | "Keyword: technical terminology\n", 228 | " Positions: [290]\n", 229 | " Score: 0.75\n", 230 | "\n", 231 | "Keyword: domain-specific jargon\n", 232 | " Positions: [311]\n", 233 | " Score: 0.75\n", 234 | "\n", 235 | "Keyword: named entities\n", 236 | " Positions: [334]\n", 237 | " Score: 0.7\n", 238 | "\n", 239 | "Keyword: action verbs\n", 240 | " Positions: [352]\n", 241 | " Score: 0.7\n", 242 | "\n", 243 | "Keyword: contextual relevance\n", 244 | " Positions: [367]\n", 245 | " Score: 0.7\n", 246 | "\n", 247 | "Keyword: tokenization\n", 248 | " Positions: [406]\n", 249 | " Score: 0.65\n", 250 | "\n", 251 | "Keyword: stopword removal\n", 252 | " Positions: [420]\n", 253 | " Score: 0.65\n", 254 | "\n", 255 | "Keyword: part-of-speech tagging\n", 256 | " Positions: [437]\n", 257 | " Score: 0.65\n", 258 | "\n", 259 | "Keyword: frequency analysis\n", 260 | " Positions: [457]\n", 261 | " Score: 0.65\n", 262 | "\n", 263 | "Keyword: semantic relationship mapping\n", 264 | " Positions: [476]\n", 265 | " Score: 0.65\n", 266 | "\n", 267 | "Keyword: essential information\n", 268 | " Positions: [508]\n", 269 | " Score: 0.6\n", 270 | "\n", 271 | "Keyword: main topics\n", 272 | " Positions: [529]\n", 273 | " Score: 0.6\n", 274 | "\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "try:\n", 280 | " data = json.loads(output_str)\n", 281 | " for kw in data[\"keywords\"]:\n", 282 | " print(f'Keyword: {kw[\"keyword\"]}')\n", 283 | " print(f' Positions: {kw[\"positions\"]}')\n", 284 | " print(f' Score: {kw[\"score\"]}\\n')\n", 285 | "except json.JSONDecodeError:\n", 286 | " print(\"Failed to parse the output as JSON. 
Raw output:\", output_str)\n" 287 | ] 288 | } 289 | ], 290 | "metadata": { 291 | "kernelspec": { 292 | "display_name": ".venv", 293 | "language": "python", 294 | "name": "python3" 295 | }, 296 | "language_info": { 297 | "name": "python", 298 | "version": "3.12.4" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /notebooks/rag_fastembed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "cb7RLf9gpEoN" 7 | }, 8 | "source": [ 9 | "# RAG pipeline using FastEmbed for embeddings generation\n", 10 | "\n", 11 | "[FastEmbed](https://qdrant.github.io/fastembed/) is a lightweight, fast, Python library built for embedding generation, maintained by Qdrant. \n", 12 | "It is suitable for generating embeddings efficiently and fast on CPU-only machines.\n", 13 | "\n", 14 | "In this notebook, we will use FastEmbed-Haystack integration to generate embeddings for indexing and RAG.\n", 15 | "\n", 16 | "**Haystack Useful Sources**\n", 17 | "\n", 18 | "* [Docs](https://docs.haystack.deepset.ai/docs/intro)\n", 19 | "* [Tutorials](https://haystack.deepset.ai/tutorials)\n", 20 | "* [Other Cookbooks](https://github.com/deepset-ai/haystack-cookbook)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Install dependencies" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "tnSq1XK_ovZV" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "!pip install fastembed-haystack qdrant-haystack wikipedia transformers" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "x8Bpy1ri_Ipx" 45 | }, 46 | "source": [ 47 | "## Download contents and create docs" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 1, 53 | "metadata": { 54 | "id": "2NIxMNQTzcfV" 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "favourite_bands=\"\"\"Audioslave\n", 59 | "Green Day\n", 60 | "Muse (band)\n", 61 | "Foo Fighters (band)\n", 62 | "Nirvana (band)\"\"\".split(\"\\n\")" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "id": "7FpCSSnUzuuP" 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "import wikipedia\n", 74 | "from haystack.dataclasses import Document\n", 75 | "\n", 76 | "raw_docs=[]\n", 77 | "for title in favourite_bands:\n", 78 | " page = wikipedia.page(title=title, auto_suggest=False)\n", 79 | " doc = Document(content=page.content, meta={\"title\": page.title, \"url\":page.url})\n", 80 | " raw_docs.append(doc)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "id": "DLiNhYKV_g8u" 87 | }, 88 | "source": [ 89 | "## Clean, split and index documents on Qdrant" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": { 96 | "id": "a1taDmfx1HCM" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", 101 | "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", 102 | "from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder\n", 103 | "from haystack.document_stores.types import DuplicatePolicy" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": { 110 | "id": "eAKP9icf1Inj" 111 
| }, 112 | "outputs": [], 113 | "source": [ 114 | "document_store = QdrantDocumentStore(\n", 115 | " \":memory:\",\n", 116 | " embedding_dim =384,\n", 117 | " recreate_index=True,\n", 118 | " return_embedding=True,\n", 119 | " wait_result_from_api=True,\n", 120 | ")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": { 127 | "id": "6rbZVTxR907n" 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "cleaner = DocumentCleaner()\n", 132 | "splitter = DocumentSplitter(split_by='sentence', split_length=3)\n", 133 | "splitted_docs = splitter.run(cleaner.run(raw_docs)[\"documents\"])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "493" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "len(splitted_docs[\"documents\"])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### FastEmbed Document Embedder\n", 161 | "\n", 162 | "Here we are initializing the FastEmbed Document Embedder and using it to generate embeddings for the documents.\n", 163 | "We are using a small and good model, `BAAI/bge-small-en-v1.5` and specifying the `parallel` parameter to 0 to use all available CPU cores for embedding generation.\n", 164 | "\n", 165 | "⚠️ If you are running this notebook on Google Colab, please note that Google Colab only provides 2 CPU cores, so the embedding generation could be not as fast as it can be on a standard machine.\n", 166 | "\n", 167 | "For more information on FastEmbed-Haystack integration, please refer to the [documentation](https://docs.haystack.deepset.ai/docs/fastembeddocumentembedder) and [API reference](https://docs.haystack.deepset.ai/reference/fastembed-embedders)." 
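To make the step described above concrete, here is a compact sketch of how the embedder can be combined with the pieces defined earlier in this notebook (`splitted_docs`, `document_store`, and the imported `DuplicatePolicy`). Parameter names follow the FastEmbed-Haystack integration linked above, but treat this as an illustrative sketch rather than a drop-in replacement for the cell that follows.

```python
# Illustrative sketch: embed the split documents with FastEmbed and index them in Qdrant.
# Assumes `splitted_docs` and `document_store` from the cells above.
document_embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5", parallel=0)
document_embedder.warm_up()  # downloads the model weights on first use

documents_with_embeddings = document_embedder.run(splitted_docs["documents"])["documents"]
document_store.write_documents(documents_with_embeddings, policy=DuplicatePolicy.OVERWRITE)
```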
168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 8, 173 | "metadata": { 174 | "id": "U3s_68uw9FSW" 175 | }, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "application/vnd.jupyter.widget-view+json": { 180 | "model_id": "a0d7dbcf196047dfa767e6e6b78374b2", 181 | "version_major": 2, 182 | "version_minor": 0 183 | }, 184 | "text/plain": [ 185 | "Fetching 9 files: 0%| | 0/9 [00:00 List[str]:\n", 131 | " titles: List[str] = []\n", 132 | " sites = list(urls.keys())[:num_sites]\n", 133 | " for site in sites:\n", 134 | " feed = feedparser.parse(urls[site])\n", 135 | " entries = feed.entries[:num_titles]\n", 136 | " for entry in entries:\n", 137 | " titles.append(entry.title)\n", 138 | " return titles\n", 139 | "\n", 140 | "titles = get_titles(urls, NUM_WEBSITES, NUM_TITLES)\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 40, 146 | "id": "c0c38c22", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "\n", 151 | "document_store = InMemoryDocumentStore()\n", 152 | "document_store.write_documents(\n", 153 | " [\n", 154 | " Document(content=title) for title in titles\n", 155 | " ]\n", 156 | ")\n", 157 | "\n", 158 | "template = \"\"\"\n", 159 | "HEADLINES:\n", 160 | "{% for document in documents %}\n", 161 | " {{ document.content }}\n", 162 | "{% endfor %}\n", 163 | "\n", 164 | "REQUEST: {{ query }}\n", 165 | "\"\"\"\n", 166 | "\n", 167 | "pipe = Pipeline()\n", 168 | "\n", 169 | "pipe.add_component(\"retriever\", InMemoryBM25Retriever(document_store=document_store))\n", 170 | "pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", 171 | "pipe.add_component(\"llm\", TakeoffGenerator(base_url=\"http://localhost\", port=3000))\n", 172 | "pipe.connect(\"retriever\", \"prompt_builder.documents\")\n", 173 | "pipe.connect(\"prompt_builder\", \"llm\")\n", 174 | "\n", 175 | "query = f\"Summarize each of the {NUM_WEBSITES * NUM_TITLES} provided headlines in three words.\"" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 41, 181 | "id": "184f41c6", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "'Two words: poker roguelike - Former Twitter engineers are building Particle, an AI-powered news reader - Best laptops of MWC 2024, including a 2-in-1 that broke a world record'" 188 | ] 189 | }, 190 | "execution_count": 41, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "titles_string" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 42, 202 | "id": "b6ae5032", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "application/vnd.jupyter.widget-view+json": { 208 | "model_id": "e73917616d6f48b6b2fe49b3b033f988", 209 | "version_major": 2, 210 | "version_minor": 0 211 | }, 212 | "text/plain": [ 213 | "Ranking by BM25...: 0%| | 0/1 [00:00      \n", 12 | "\n", 13 | "*Notebook by [Stefano Fiorucci](https://github.com/anakin87)*\n", 14 | "\n", 15 | "This notebook shows how to use the [vLLM inference engine](https://docs.vllm.ai/en/latest/) with Haystack." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "t9GskGf6wGgE" 22 | }, 23 | "source": [ 24 | "## Install vLLM + Haystack\n", 25 | "\n", 26 | "- we install vLLM using pip ([docs](https://docs.vllm.ai/en/latest/getting_started/installation.html))\n", 27 | "- for production use cases, there are many other options, including Docker ([docs](https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html))" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 12, 33 | "metadata": { 34 | "colab": { 35 | "base_uri": "https://localhost:8080/" 36 | }, 37 | "id": "46yIpiO_b16C", 38 | "outputId": "5b451759-19e6-4381-a588-e40face7563c" 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 46 | "Copyright (c) 2005-2023 NVIDIA Corporation\n", 47 | "Built on Tue_Aug_15_22:02:13_PDT_2023\n", 48 | "Cuda compilation tools, release 12.2, V12.2.140\n", 49 | "Build cuda_12.2.r12.2/compiler.33191640_0\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "# we check that CUDA is >=12.1 (https://docs.vllm.ai/en/latest/getting_started/installation.html#install-with-pip)\n", 55 | "! nvcc --version" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "colab": { 63 | "base_uri": "https://localhost:8080/" 64 | }, 65 | "id": "RXZsGS9YbUr-", 66 | "outputId": "eedf455a-cbe1-477c-b62d-51b832d9e8cc" 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "! pip install vllm haystack-ai" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "id": "qt__RIekwTX1" 77 | }, 78 | "source": [ 79 | "## Run a vLLM OpenAI-compatible server in Colab\n", 80 | "\n", 81 | "vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. Read more [in the docs](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server).\n", 82 | "\n", 83 | "In Colab, we start the OpenAI-compatible server using Python.\n", 84 | "For environments that support Docker, we can run the server using Docker ([docs](https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html)).\n", 85 | "\n", 86 | "*Significant parameters:*\n", 87 | "- **model**: [TheBloke/notus-7B-v1-AWQ](https://huggingface.co/TheBloke/notus-7B-v1-AWQ) is the AWQ quantized version of a good LLM by Argilla. Several model architectures are supported; models are automatically downloaded from Hugging Face as needed. For a comprehensive list of the supported models, see the [docs](https://docs.vllm.ai/en/latest/models/supported_models.html).\n", 88 | "\n", 89 | "- **quantization**: awq. AWQ is a quantization method that allows LLMs to run (fast) when GPU resources are limited. [Simple blogpost on quantization techniques](https://www.maartengrootendorst.com/blog/quantization/#awq-activation-aware-weight-quantization)\n", 90 | "- **max_model_len**: we specify a [maximum context length](https://docs.vllm.ai/en/latest/models/engine_args.html), which consists of the maximum number of tokens (prompt + response). 
Otherwise, the model does not fit in Colab and we get an OOM error.\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 1, 96 | "metadata": { 97 | "colab": { 98 | "base_uri": "https://localhost:8080/" 99 | }, 100 | "id": "wqceUZ4yj2IT", 101 | "outputId": "f94b1e26-42d7-4362-d5f1-d9c6ce0a5968" 102 | }, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "nohup: redirecting stderr to stdout\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "# we prepend \"nohup\" and postpend \"&\" to make the Colab cell run in background\n", 114 | "! nohup python -m vllm.entrypoints.openai.api_server \\\n", 115 | " --model TheBloke/notus-7B-v1-AWQ \\\n", 116 | " --quantization awq \\\n", 117 | " --max-model-len 2048 \\\n", 118 | " > vllm.log &" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 2, 124 | "metadata": { 125 | "colab": { 126 | "base_uri": "https://localhost:8080/" 127 | }, 128 | "id": "obZ83mNDp2M3", 129 | "outputId": "247812a7-ea57-475c-d3b0-817f2d9f0589" 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "INFO 02-16 10:57:39 llm_engine.py:72] Initializing an LLM engine with config: model='TheBloke/notus-7B-v1-AWQ', tokenizer='TheBloke/notus-7B-v1-AWQ', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, seed=0)\n", 137 | "INFO 02-16 10:57:43 weight_utils.py:164] Using model weights format ['*.safetensors']\n", 138 | "INFO 02-16 10:57:43 weight_utils.py:164] Using model weights format ['*.safetensors']\n", 139 | "INFO 02-16 10:57:55 llm_engine.py:322] # GPU blocks: 4108, # CPU blocks: 2048\n", 140 | "INFO 02-16 10:57:58 model_runner.py:636] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n", 141 | "INFO 02-16 10:57:58 model_runner.py:636] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n", 142 | "INFO 02-16 10:57:58 model_runner.py:636] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "# we check the logs until the server has been started correctly\n", 148 | "!while ! grep -q \"Application startup complete\" vllm.log; do tail -n 1 vllm.log; sleep 5; done" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "id": "MgQbgCcA1jQ6" 155 | }, 156 | "source": [ 157 | "## Chat with the model using OpenAIChatGenerator\n", 158 | "\n", 159 | "Once we have launched the vLLM-compatible OpenAI server,\n", 160 | "we can simply initialize an `OpenAIChatGenerator` pointing to the vLLM server URL and start chatting!" 
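Before wiring Haystack to the server, it can help to confirm that the OpenAI-compatible endpoint is actually reachable. The optional check below uses the `requests` library against the standard `/v1/models` route exposed by OpenAI-compatible servers; adjust the URL if you started vLLM on a different host or port.

```python
# Optional: confirm the vLLM OpenAI-compatible server is up before creating the generator.
import requests

response = requests.get("http://localhost:8000/v1/models")
response.raise_for_status()
print([m["id"] for m in response.json()["data"]])  # expect the served model id, e.g. TheBloke/notus-7B-v1-AWQ
```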
161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 3, 166 | "metadata": { 167 | "id": "ypVbte5DrkWO" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "from haystack.components.generators.chat import OpenAIChatGenerator\n", 172 | "from haystack.dataclasses import ChatMessage\n", 173 | "from haystack.utils import Secret\n", 174 | "\n", 175 | "generator = OpenAIChatGenerator(\n", 176 | " api_key=Secret.from_token(\"VLLM-PLACEHOLDER-API-KEY\"), # for compatibility with the OpenAI API, a placeholder api_key is needed\n", 177 | " model=\"TheBloke/notus-7B-v1-AWQ\",\n", 178 | " api_base_url=\"http://localhost:8000/v1\",\n", 179 | " generation_kwargs = {\"max_tokens\": 512}\n", 180 | ")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 17, 186 | "metadata": { 187 | "colab": { 188 | "base_uri": "https://localhost:8080/" 189 | }, 190 | "id": "45xJjlEoCB7v", 191 | "outputId": "7f939707-5b25-4d3f-fefc-0368d15f236a" 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "Enter your message or Q to exit\n", 199 | "🧑 hello. can you help planning my next travel to Italy?\n", 200 | "🤖 Certainly! I'd be happy to help you plan your next trip to Italy. Here are some steps to help you plan your trip:\n", 201 | "\n", 202 | "1. Determine your travel dates: Decide when you want to travel to Italy. Keep in mind that peak season is from June to August, so prices may be higher, and crowds may be larger.\n", 203 | "\n", 204 | "2. Decide on your destination: Italy is a large country with many beautiful destinations. Consider which cities and regions you would like to visit. Some popular destinations include Rome, Florence, Venice, Amalfi Coast, Tuscany, and the Italian Lakes.\n", 205 | "\n", 206 | "3. Research flights and transportation: Look for flights that fit your budget and travel dates. If you're planning on traveling between cities, research trains and buses. Familiarize yourself with the transportation options in your destination cities.\n", 207 | "\n", 208 | "4. Consider accommodation: Research different types of accommodations, such as hotels, vacation rentals, or hostels. Look for a place that fits your budget and travel style.\n", 209 | "\n", 210 | "5. Plan your itinerary: Make a list of the things you want to see and do in each destination. Consider how much time you want to spend in each location.\n", 211 | "\n", 212 | "6. Research tours and activities: Look for tours and activities that interest you. This could include food tours, wine tastings, museum visits, or tours of historic sites.\n", 213 | "\n", 214 | "7. Learn some Italian: It's always a good idea to learn some Italian before your trip, even if it's just basic phrases. This will help you communicate with locals and make your trip more enjoyable.\n", 215 | "\n", 216 | "8. Pack appropriately: Make sure to pack appropriately for the season and destination. Check the weather forecast before you pack.\n", 217 | "\n", 218 | "9. Exchange currency: Research the best place to exchange currency in your destination. Also, check if your bank or credit card offers no-fee withdrawals.\n", 219 | "\n", 220 | "10. Purchase travel insurance: Consider purchasing travel insurance to protect yourself in case of unforeseen circumstances, like a cancelled flight or lost luggage.\n", 221 | "\n", 222 | "I hope these steps help you plan your next trip to Italy! 
Let me know if you have any questions or if you need further assistance.\n", 223 | "Enter your message or Q to exit\n", 224 | "🧑 Please list 5 places I should absolutely visit\n", 225 | "🤖 Certainly! Here are 5 places in Italy that I think should be on your must-visit list:\n", 226 | "\n", 227 | "1. Rome - The Eternal City is home to ancient ruins like the Colosseum and the Roman Forum, as well as iconic landmarks like the Vatican City, the Pantheon, and the Trevi Fountain. Rome is also known for its delicious food and vibrant nightlife.\n", 228 | "\n", 229 | "2. Florence - Known as the birthplace of the Renaissance, Florence is home to incredible art and architecture, including the Uffizi Gallery, the Duomo, and the Palazzo Vecchio. Be sure to try some of Florence's famous gelato while you're there!\n", 230 | "\n", 231 | "3. Venice - A city built on water, Venice is known for its canals, palaces, and bridges. Don't miss a gondola ride through the city's canals or a visit to St. Mark's Basilica and the Doge's Palace.\n", 232 | "\n", 233 | "4. Amalfi Coast - This scenic coastline is home to charming towns like Positano, Ravello, and Amalfi, as well as stunning cliffs, beaches, and seaside restaurants.\n", 234 | "\n", 235 | "5. Cinque Terre - This UNESCO World Heritage Site is a series of five colorful villages on the Italian Riviera. Hike between the villages or take a boat ride to see the beautiful coastline.\n", 236 | "\n", 237 | "These are just a few of the many incredible places to visit in Italy, but I hope this helps you start planning your itinerary!\n", 238 | "Enter your message or Q to exit\n", 239 | "🧑 Nice! Can you expand the fith point?\n", 240 | "🤖 Absolutely! The Cinque Terre, which translates to \"Five Lands,\" is a series of five picturesque villages located along the Italian Riviera. The villages are Monterosso al Mare, Vernazza, Corniglia, Manarola, and Riomaggiore. Here are some must-see highlights of each village:\n", 241 | "\n", 242 | "1. Monterosso al Mare - This village is the largest of the five, and is known for its long sandy beach, medieval castle ruins, and the Church of San Francesco, which offers stunning views from its bell tower.\n", 243 | "\n", 244 | "2. Vernazza - This village is often considered the jewel of the Cinque Terre. Don't miss the stunning views from the top of the castle ruins, and be sure to try some of Vernazza's famous seafood dishes.\n", 245 | "\n", 246 | "3. Corniglia - This village is perched on a cliff and can only be reached by foot or by a narrow stairway. Once you reach the top, you can visit the Church of San Lorenzo, which offers stunning views of the sea.\n", 247 | "\n", 248 | "4. Manarola - This village is known for its colorful houses that cascade down the cliffs, as well as its ancient church, San Carlo, and the Church of St. Lawrence.\n", 249 | "\n", 250 | "5. Riomaggiore - This village is the southernmost of the five and is known for its narrow streets, colorful houses, and seaside restaurants. Don't miss the Church of San Giovanni Battista, which sits at the edge of the village and offers stunning views of the sea.\n", 251 | "\n", 252 | "The best way to experience the Cinque Terre is by hiking between the villages along the Sentiero Azzurro (Blue Trail). This hike offers incredible views of the coastline and the villages below. 
Be sure to check the weather forecast before you go, as the trail can be challenging in rainy or windy conditions.\n", 253 | "\n", 254 | "I hope this helps you plan your visit to the stunning Cinque Terre!\n", 255 | "Enter your message or Q to exit\n", 256 | "🧑 Q\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "messages = []\n", 262 | "\n", 263 | "while True:\n", 264 | " msg = input(\"Enter your message or Q to exit\\n🧑 \")\n", 265 | " if msg==\"Q\":\n", 266 | " break\n", 267 | " messages.append(ChatMessage.from_user(msg))\n", 268 | " response = generator.run(messages=messages)\n", 269 | " assistant_resp = response['replies'][0]\n", 270 | " print(\"🤖 \"+assistant_resp.text)\n", 271 | " messages.append(assistant_resp)" 272 | ] 273 | } 274 | ], 275 | "metadata": { 276 | "accelerator": "GPU", 277 | "colab": { 278 | "collapsed_sections": [ 279 | "t9GskGf6wGgE" 280 | ], 281 | "gpuType": "T4", 282 | "provenance": [] 283 | }, 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "name": "python" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 0 294 | } 295 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nbconvert 2 | -------------------------------------------------------------------------------- /scripts/generate_markdowns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from subprocess import check_output 4 | import re 5 | 6 | import tomllib 7 | from nbconvert import MarkdownExporter 8 | from nbconvert.filters.strings import get_lines 9 | from nbformat import read, NO_CONVERT 10 | 11 | 12 | def generate_frontmatter(notebook): 13 | last_commit_date = ( 14 | check_output(f'git log -1 --pretty=format:"%cs" {notebook["file"]}'.split()) 15 | .decode() 16 | .strip() 17 | ) 18 | first_commit_date = ( 19 | check_output( 20 | f'git log --reverse --pretty=format:"%cs" {notebook["file"]}'.split() 21 | ) 22 | .decode() 23 | .strip() 24 | .splitlines()[0] 25 | ) 26 | 27 | frontmatter = f"""--- 28 | layout: cookbook 29 | featured_image: /images/cookbook-thumbnail.png 30 | images: ["/images/cookbook-thumbnail.png"] 31 | sitemap_exclude: {notebook.get("sitemap_exclude", False)} 32 | colab: {notebook.get("colab")} 33 | toc: True 34 | title: "{notebook["title"]}" 35 | lastmod: {last_commit_date} 36 | created_at: {first_commit_date} 37 | download: /downloads/{notebook["file"].name} 38 | featured: {notebook.get("featured", False)} 39 | experimental: {notebook.get("experimental", False)} 40 | discuss: {notebook.get("discuss", False)} 41 | hidden: {notebook.get("hidden", False)} 42 | new: {notebook.get("new", False)} 43 | topics: {notebook.get("topics", False)} 44 | --- 45 | """ 46 | return frontmatter 47 | 48 | 49 | def clean_colab_dataframe_cells(notebook): 50 | """ 51 | When there ara dataframes, Colab notebooks contain text/html data 52 | that include style and script, not properly rendered in markdown. 53 | This function removes this data from the notebook. 
54 | """ 55 | 56 | for cell in notebook.cells: 57 | if cell.cell_type == "code" and "outputs" in cell: 58 | for output in cell.outputs: 59 | # we recognize this type of data by the key 'application/vnd.google.colaboratory.intrinsic+json' 60 | # and the type 'dataframe' 61 | if 'application/vnd.google.colaboratory.intrinsic+json' in output.get('data', {}): 62 | if output['data']['application/vnd.google.colaboratory.intrinsic+json'].get('type') == 'dataframe': 63 | output['data'].pop('text/html', None) 64 | 65 | return notebook 66 | 67 | def generate_markdown_from_notebook(notebook, output_path): 68 | frontmatter = generate_frontmatter(notebook) 69 | md_exporter = MarkdownExporter(exclude_output=False) 70 | 71 | with open(notebook["file"], "r", encoding="utf-8") as f: 72 | nb = read(f, as_version=NO_CONVERT) 73 | 74 | # process the notebook to clean Colab dataframe cells 75 | cleaned_nb = clean_colab_dataframe_cells(nb) 76 | 77 | body, _ = md_exporter.from_notebook_node(cleaned_nb) 78 | body = get_lines(body, start=1) 79 | 80 | # remove output images from the markdown: they are not handled properly 81 | img_pattern=r'^!\[png\]\(.*\.png\)$' 82 | body = re.sub(img_pattern, '', body, flags=re.MULTILINE) 83 | 84 | print(f"Processing {notebook['file']}") 85 | filename = notebook["file"].stem 86 | with open(f"{output_path}/{filename}.md", "w", encoding="utf-8") as f: 87 | try: 88 | f.write(frontmatter + "\n\n") 89 | except IndexError as e: 90 | raise IndexError( 91 | "Can't find the header for this tutorial. Have you added it in 'scripts/generate_markdowns.py'?" 92 | ) from e 93 | f.write(body) 94 | 95 | 96 | if __name__ == "__main__": 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument("--output", dest="output", default="markdowns") 99 | args = parser.parse_args() 100 | root_path = Path(__file__).parent.parent 101 | 102 | readme_file = root_path / "README.md" 103 | readme_content = readme_file.read_text() 104 | 105 | index_file = root_path / "index.toml" 106 | index_data = tomllib.loads(index_file.read_text()) 107 | 108 | if not Path(args.output).exists(): 109 | Path(args.output).mkdir(parents=True, exist_ok=True) 110 | 111 | for cookbook_data in index_data["cookbook"]: 112 | data = { 113 | "file": root_path / "notebooks" / cookbook_data["notebook"], 114 | "title": cookbook_data["title"], 115 | "colab": f"{index_data['config']['colab']}/{cookbook_data['notebook']}", 116 | "featured": cookbook_data.get("featured", False), 117 | "experimental": cookbook_data.get("experimental", False), 118 | "discuss": cookbook_data.get("discuss", False), 119 | "hidden": cookbook_data.get("hidden", False), 120 | "new": cookbook_data.get("new", False), 121 | "topics": cookbook_data.get("topics", False), 122 | } 123 | generate_markdown_from_notebook(data, args.output) -------------------------------------------------------------------------------- /scripts/verify_index.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import tomllib 4 | 5 | if __name__ == "__main__": 6 | root_path = Path(__file__).parent.parent 7 | 8 | index_file = root_path / "index.toml" 9 | index_data = tomllib.loads(index_file.read_text()) 10 | 11 | notebooks = list(root_path.glob("notebooks/*.ipynb")) 12 | 13 | failed = False 14 | indexed_notebooks = [ 15 | notebook["notebook"] 16 | for notebook in index_data["cookbook"] 17 | if "notebook" in notebook 18 | ] 19 | for notebook in notebooks: 20 | if notebook.name not in indexed_notebooks: 21 | print(f"Notebook 
'{notebook.name}' not found in 'index.toml'") 22 | failed = True 23 | 24 | for notebook in index_data["cookbook"]: 25 | if "notebook" not in notebook: 26 | print(f"Notebook '{notebook}' has no file") 27 | failed = True 28 | continue 29 | 30 | if not (root_path / f"notebooks/{notebook['notebook']}").exists(): 31 | print(f"Notebook '{notebook['notebook']}' file not found") 32 | failed = True 33 | 34 | if "title" not in notebook: 35 | print(f"Notebook '{notebook['notebook']}' has no title") 36 | failed = True 37 | 38 | if not notebook.get("topics"): 39 | print(f"Notebook '{notebook['notebook']}' has no topics") 40 | failed = True 41 | 42 | if failed: 43 | exit(1) 44 | --------------------------------------------------------------------------------
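Both helper scripts are driven by `index.toml`: `scripts/generate_markdowns.py` turns each entry into Hugo frontmatter plus a Colab link, and `scripts/verify_index.py` fails the check when a notebook in `notebooks/` is missing from the index or when an entry lacks a notebook file, a title, or topics. The sketch below illustrates the shape of an entry that satisfies both scripts; it is only a sketch, the notebook name, title, topics, and Colab base URL are made-up placeholders, and only the field names are taken from the keys the scripts actually read.

```python
# Illustrative sketch of the index.toml shape expected by the two scripts above.
# The entry values are hypothetical; only the field names mirror the keys the scripts read.
import tomllib

sample = """
[config]
colab = "https://colab.research.google.com/github/<org>/<repo>/blob/main/notebooks"  # placeholder base URL

[[cookbook]]
notebook = "my_new_notebook.ipynb"  # hypothetical file that would live in notebooks/
title = "My New Cookbook Entry"
topics = ["RAG"]
featured = false                    # optional flags are read with .get(..., False)
experimental = false
"""

index_data = tomllib.loads(sample)
entry = index_data["cookbook"][0]

# The same conditions verify_index.py enforces for each entry:
assert "notebook" in entry and "title" in entry and entry.get("topics")

# generate_markdowns.py builds the Colab link for the frontmatter like this:
colab_url = f"{index_data['config']['colab']}/{entry['notebook']}"
print(colab_url)
```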
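The Colab dataframe outputs that `clean_colab_dataframe_cells` strips can be pictured roughly as follows. This is a hand-built stand-in constructed with `nbformat`; real Colab payloads also carry `text/plain` data and much larger HTML, but the two keys shown are the ones the function checks.

```python
# Rough stand-in for a Colab dataframe output, built with nbformat for illustration.
from nbformat.v4 import new_notebook, new_code_cell, new_output

cell = new_code_cell(source="df")
cell.outputs = [
    new_output(
        "display_data",
        data={
            # Colab-injected HTML with style/script tags, which renders badly in Markdown
            "text/html": "<div><style>...</style><script>...</script>...</div>",
            # the marker key and type the cleaning function looks for
            "application/vnd.google.colaboratory.intrinsic+json": {"type": "dataframe"},
        },
    )
]
nb = new_notebook(cells=[cell])

# After clean_colab_dataframe_cells(nb), the text/html payload is removed from this
# output while the remaining representations are left untouched.
```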
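In the vLLM notebook itself, the server is exercised through Haystack's `OpenAIChatGenerator`, but since it exposes the OpenAI API protocol it can also be queried with the plain `openai` client. A minimal sketch, assuming the `openai` v1 Python package is installed and the server started in the notebook is still listening on `http://localhost:8000/v1`:

```python
# Minimal sketch: query the vLLM OpenAI-compatible server directly with the openai client.
# Assumes `pip install openai` (v1 interface) and the server from the notebook running locally.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # vLLM server URL used in the notebook
    api_key="VLLM-PLACEHOLDER-API-KEY",   # placeholder, as in the notebook; a server started without --api-key does not validate it
)

response = client.chat.completions.create(
    model="TheBloke/notus-7B-v1-AWQ",
    messages=[{"role": "user", "content": "In one sentence, what is vLLM?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

The same pattern works for any OpenAI-compatible client, which is what makes the vLLM server a drop-in backend for the `OpenAIChatGenerator` used in the notebook.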